Blame - vendor/github.com/coreos/etcd/etcdserver/raft.go - voltha-go

blob: 212dccbdcc8af393d279725dd8fb3e7799727c7f [file] [log] [blame]

khenaidoo	d948f77	2021-08-11 17:49:24 -0400	[diff] [blame]	1	// Copyright 2015 The etcd Authors
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	package etcdserver
				16
				17	import (
				18	"encoding/json"
				19	"expvar"
				20	"sort"
				21	"sync"
				22	"sync/atomic"
				23	"time"
				24
				25	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
				26	"github.com/coreos/etcd/etcdserver/membership"
				27	"github.com/coreos/etcd/pkg/contention"
				28	"github.com/coreos/etcd/pkg/pbutil"
				29	"github.com/coreos/etcd/pkg/types"
				30	"github.com/coreos/etcd/raft"
				31	"github.com/coreos/etcd/raft/raftpb"
				32	"github.com/coreos/etcd/rafthttp"
				33	"github.com/coreos/etcd/wal"
				34	"github.com/coreos/etcd/wal/walpb"
				35	"github.com/coreos/pkg/capnslog"
				36	)
				37
				38	const (
				39	// Number of entries for slow follower to catch-up after compacting
				40	// the raft storage entries.
				41	// We expect the follower has a millisecond level latency with the leader.
				42	// The max throughput is around 10K. Keep a 5K entries is enough for helping
				43	// follower to catch up.
				44	numberOfCatchUpEntries = 5000
				45
				46	// The max throughput of etcd will not exceed 100MB/s (100K * 1KB value).
				47	// Assuming the RTT is around 10ms, 1MB max size is large enough.
				48	maxSizePerMsg = 1 * 1024 * 1024
				49	// Never overflow the rafthttp buffer, which is 4096.
				50	// TODO: a better const?
				51	maxInflightMsgs = 4096 / 8
				52	)
				53
				54	var (
				55	// protects raftStatus
				56	raftStatusMu sync.Mutex
				57	// indirection for expvar func interface
				58	// expvar panics when publishing duplicate name
				59	// expvar does not support remove a registered name
				60	// so only register a func that calls raftStatus
				61	// and change raftStatus as we need.
				62	raftStatus func() raft.Status
				63	)
				64
				65	func init() {
				66	raft.SetLogger(capnslog.NewPackageLogger("github.com/coreos/etcd", "raft"))
				67	expvar.Publish("raft.status", expvar.Func(func() interface{} {
				68	raftStatusMu.Lock()
				69	defer raftStatusMu.Unlock()
				70	return raftStatus()
				71	}))
				72	}
				73
				74	type RaftTimer interface {
				75	Index() uint64
				76	Term() uint64
				77	}
				78
				79	// apply carries the entries and the snapshot to be applied. Once
				80	// an apply is consumed, the entries will be persisted
				81	// to raft storage concurrently; the application must receive from
				82	// notifyc before assuming the raft messages are stable.
				83	type apply struct {
				84	entries []raftpb.Entry // committed entries taken from the raft Ready
				85	snapshot raftpb.Snapshot // snapshot taken from the raft Ready; may be empty
				86	// notifyc synchronizes etcd server applies with the raft node
				87	notifyc chan struct{}
				88	}
				89
				90	type raftNode struct {
				91	// Cache of the latest raft index and raft term the server has seen.
				92	// These three uint64 fields must be the first elements to keep 64-bit
				93	// alignment for atomic access to the fields.
				94	index uint64
				95	term uint64
				96	lead uint64 // ID of the current leader, stored atomically by the run loop
				97
				98	tickMu *sync.Mutex // held around Tick calls (see tick)
				99	raftNodeConfig
				100
				101	// a chan to send/receive snapshot
				102	msgSnapC chan raftpb.Message
				103
				104	// a chan to send out apply
				105	applyc chan apply
				106
				107	// a chan to send out readState
				108	readStateC chan raft.ReadState
				109
				110	// utility
				111	ticker *time.Ticker
				112	// contention detectors for raft heartbeat message
				113	td *contention.TimeoutDetector
				114
				115	stopped chan struct{} // receives the stop request sent by stop
				116	done chan struct{} // closed by onStop once shutdown has finished
				117	}
				118
				119	type raftNodeConfig struct {
				120	// to check if msg receiver is removed from cluster
				121	isIDRemoved func(id uint64) bool
				122	raft.Node
				123	raftStorage *raft.MemoryStorage // in-memory raft storage; receives snapshots and entries in the run loop
				124	storage Storage // persistent storage used for SaveSnap/Save/Sync/Release/Close
				125	heartbeat time.Duration // for logging
				126	// transport specifies the transport to send and receive msgs to members.
				127	// Sending messages MUST NOT block. It is okay to drop messages, since
				128	// clients should timeout and reissue their messages.
				129	// If transport is nil, server will panic.
				130	transport rafthttp.Transporter
				131	}
				132
				133	func newRaftNode(cfg raftNodeConfig) *raftNode {
				134	r := &raftNode{
				135	tickMu: new(sync.Mutex),
				136	raftNodeConfig: cfg,
				137	// set up contention detectors for raft heartbeat message.
				138	// expect to send a heartbeat within 2 heartbeat intervals.
				139	td: contention.NewTimeoutDetector(2 * cfg.heartbeat),
				140	readStateC: make(chan raft.ReadState, 1),
				141	msgSnapC: make(chan raftpb.Message, maxInFlightMsgSnap),
				142	applyc: make(chan apply),
				143	stopped: make(chan struct{}),
				144	done: make(chan struct{}),
				145	}
				146	if r.heartbeat == 0 {
				147	r.ticker = &time.Ticker{}
				148	} else {
				149	r.ticker = time.NewTicker(r.heartbeat)
				150	}
				151	return r
				152	}
				153
				154	// raft.Node does not have locks in Raft package
				155	func (r *raftNode) tick() {
				156	r.tickMu.Lock()
				157	r.Tick()
				158	r.tickMu.Unlock()
				159	}
				160
				161	// start prepares and starts raftNode in a new goroutine. It is no longer safe
				162	// to modify the fields after it has been started.
				163	func (r raftNode) start(rh raftReadyHandler) {
				164	internalTimeout := time.Second
				165
				166	go func() {
				167	defer r.onStop()
				168	islead := false
				169
				170	for {
				171	select {
				172	case <-r.ticker.C:
				173	r.tick()
				174	case rd := <-r.Ready():
				175	if rd.SoftState != nil {
				176	newLeader := rd.SoftState.Lead != raft.None && atomic.LoadUint64(&r.lead) != rd.SoftState.Lead
				177	if newLeader {
				178	leaderChanges.Inc()
				179	}
				180
				181	if rd.SoftState.Lead == raft.None {
				182	hasLeader.Set(0)
				183	} else {
				184	hasLeader.Set(1)
				185	}
				186
				187	atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
				188	islead = rd.RaftState == raft.StateLeader
				189	if islead {
				190	isLeader.Set(1)
				191	} else {
				192	isLeader.Set(0)
				193	}
				194	rh.updateLeadership(newLeader)
				195	r.td.Reset()
				196	}
				197
				198	if len(rd.ReadStates) != 0 {
				199	select {
				200	case r.readStateC <- rd.ReadStates[len(rd.ReadStates)-1]:
				201	case <-time.After(internalTimeout):
				202	plog.Warningf("timed out sending read state")
				203	case <-r.stopped:
				204	return
				205	}
				206	}
				207
				208	notifyc := make(chan struct{}, 1)
				209	ap := apply{
				210	entries: rd.CommittedEntries,
				211	snapshot: rd.Snapshot,
				212	notifyc: notifyc,
				213	}
				214
				215	updateCommittedIndex(&ap, rh)
				216
				217	select {
				218	case r.applyc <- ap:
				219	case <-r.stopped:
				220	return
				221	}
				222
				223	// the leader can write to its disk in parallel with replicating to the followers and them
				224	// writing to their disks.
				225	// For more details, check raft thesis 10.2.1
				226	if islead {
				227	// gofail: var raftBeforeLeaderSend struct{}
				228	r.transport.Send(r.processMessages(rd.Messages))
				229	}
				230
				231	// Must save the snapshot file and WAL snapshot entry before saving any other entries or hardstate to
				232	// ensure that recovery after a snapshot restore is possible.
				233	if !raft.IsEmptySnap(rd.Snapshot) {
				234	// gofail: var raftBeforeSaveSnap struct{}
				235	if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
				236	plog.Fatalf("failed to save Raft snapshot %v", err)
				237	}
				238	// gofail: var raftAfterSaveSnap struct{}
				239	}
				240
				241	// gofail: var raftBeforeSave struct{}
				242	if err := r.storage.Save(rd.HardState, rd.Entries); err != nil {
				243	plog.Fatalf("failed to raft save state and entries %v", err)
				244	}
				245	if !raft.IsEmptyHardState(rd.HardState) {
				246	proposalsCommitted.Set(float64(rd.HardState.Commit))
				247	}
				248	// gofail: var raftAfterSave struct{}
				249
				250	if !raft.IsEmptySnap(rd.Snapshot) {
				251	// Force WAL to fsync its hard state before Release() releases
				252	// old data from the WAL. Otherwise could get an error like:
				253	// panic: tocommit(107) is out of range [lastIndex(84)]. Was the raft log corrupted, truncated, or lost?
				254	// See https://github.com/etcd-io/etcd/issues/10219 for more details.
				255	if err := r.storage.Sync(); err != nil {
				256	plog.Fatalf("failed to sync Raft snapshot %v", err)
				257	}
				258
				259	// etcdserver now claim the snapshot has been persisted onto the disk
				260	notifyc <- struct{}{}
				261
				262	// gofail: var raftAfterSaveSnap struct{}
				263	r.raftStorage.ApplySnapshot(rd.Snapshot)
				264	plog.Infof("raft applied incoming snapshot at index %d", rd.Snapshot.Metadata.Index)
				265	// gofail: var raftAfterApplySnap struct{}
				266
				267	if err := r.storage.Release(rd.Snapshot); err != nil {
				268	plog.Fatalf("failed to release Raft wal %v", err)
				269	}
				270	}
				271
				272	r.raftStorage.Append(rd.Entries)
				273
				274	if !islead {
				275	// finish processing incoming messages before we signal raftdone chan
				276	msgs := r.processMessages(rd.Messages)
				277
				278	// now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots
				279	notifyc <- struct{}{}
				280
				281	// Candidate or follower needs to wait for all pending configuration
				282	// changes to be applied before sending messages.
				283	// Otherwise we might incorrectly count votes (e.g. votes from removed members).
				284	// Also slow machine's follower raft-layer could proceed to become the leader
				285	// on its own single-node cluster, before apply-layer applies the config change.
				286	// We simply wait for ALL pending entries to be applied for now.
				287	// We might improve this later on if it causes unnecessary long blocking issues.
				288	waitApply := false
				289	for _, ent := range rd.CommittedEntries {
				290	if ent.Type == raftpb.EntryConfChange {
				291	waitApply = true
				292	break
				293	}
				294	}
				295	if waitApply {
				296	// blocks until 'applyAll' calls 'applyWait.Trigger'
				297	// to be in sync with scheduled config-change job
				298	// (assume notifyc has cap of 1)
				299	select {
				300	case notifyc <- struct{}{}:
				301	case <-r.stopped:
				302	return
				303	}
				304	}
				305
				306	// gofail: var raftBeforeFollowerSend struct{}
				307	r.transport.Send(msgs)
				308	} else {
				309	// leader already processed 'MsgSnap' and signaled
				310	notifyc <- struct{}{}
				311	}
				312
				313	r.Advance()
				314	case <-r.stopped:
				315	return
				316	}
				317	}
				318	}()
				319	}
				320
				321	func updateCommittedIndex(ap apply, rh raftReadyHandler) {
				322	var ci uint64
				323	if len(ap.entries) != 0 {
				324	ci = ap.entries[len(ap.entries)-1].Index
				325	}
				326	if ap.snapshot.Metadata.Index > ci {
				327	ci = ap.snapshot.Metadata.Index
				328	}
				329	if ci != 0 {
				330	rh.updateCommittedIndex(ci)
				331	}
				332	}
				333
				334	func (r *raftNode) processMessages(ms []raftpb.Message) []raftpb.Message {
				335	sentAppResp := false
				336	for i := len(ms) - 1; i >= 0; i-- {
				337	if r.isIDRemoved(ms[i].To) {
				338	ms[i].To = 0
				339	}
				340
				341	if ms[i].Type == raftpb.MsgAppResp {
				342	if sentAppResp {
				343	ms[i].To = 0
				344	} else {
				345	sentAppResp = true
				346	}
				347	}
				348
				349	if ms[i].Type == raftpb.MsgSnap {
				350	// There are two separate data store: the store for v2, and the KV for v3.
				351	// The msgSnap only contains the most recent snapshot of store without KV.
				352	// So we need to redirect the msgSnap to etcd server main loop for merging in the
				353	// current store snapshot and KV snapshot.
				354	select {
				355	case r.msgSnapC <- ms[i]:
				356	default:
				357	// drop msgSnap if the inflight chan if full.
				358	}
				359	ms[i].To = 0
				360	}
				361	if ms[i].Type == raftpb.MsgHeartbeat {
				362	ok, exceed := r.td.Observe(ms[i].To)
				363	if !ok {
				364	// TODO: limit request rate.
				365	plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v, to %x)", r.heartbeat, exceed, ms[i].To)
				366	plog.Warningf("server is likely overloaded")
				367	heartbeatSendFailures.Inc()
				368	}
				369	}
				370	}
				371	return ms
				372	}
				373
				374	func (r *raftNode) apply() chan apply {
				375	return r.applyc
				376	}
				377
				378	func (r *raftNode) stop() {
				379	r.stopped <- struct{}{}
				380	<-r.done
				381	}
				382
				383	func (r *raftNode) onStop() {
				384	r.Stop()
				385	r.ticker.Stop()
				386	r.transport.Stop()
				387	if err := r.storage.Close(); err != nil {
				388	plog.Panicf("raft close storage error: %v", err)
				389	}
				390	close(r.done)
				391	}
				392
				393	// for testing
				394	func (r *raftNode) pauseSending() {
				395	p := r.transport.(rafthttp.Pausable)
				396	p.Pause()
				397	}
				398
				399	func (r *raftNode) resumeSending() {
				400	p := r.transport.(rafthttp.Pausable)
				401	p.Resume()
				402	}
				403
				404	// advanceTicks advances ticks of Raft node.
				405	// This can be used for fast-forwarding election
				406	// ticks in multi data-center deployments, thus
				407	// speeding up election process.
				408	func (r *raftNode) advanceTicks(ticks int) {
				409	for i := 0; i < ticks; i++ {
				410	r.tick()
				411	}
				412	}
				413
				414	func startNode(cfg ServerConfig, cl membership.RaftCluster, ids []types.ID) (id types.ID, n raft.Node, s raft.MemoryStorage, w *wal.WAL) {
				415	var err error
				416	member := cl.MemberByName(cfg.Name)
				417	metadata := pbutil.MustMarshal(
				418	&pb.Metadata{
				419	NodeID: uint64(member.ID),
				420	ClusterID: uint64(cl.ID()),
				421	},
				422	)
				423	if w, err = wal.Create(cfg.WALDir(), metadata); err != nil {
				424	plog.Panicf("create wal error: %v", err)
				425	}
				426	peers := make([]raft.Peer, len(ids))
				427	for i, id := range ids {
				428	ctx, err := json.Marshal((*cl).Member(id))
				429	if err != nil {
				430	plog.Panicf("marshal member should never fail: %v", err)
				431	}
				432	peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
				433	}
				434	id = member.ID
				435	plog.Infof("starting member %s in cluster %s", id, cl.ID())
				436	s = raft.NewMemoryStorage()
				437	c := &raft.Config{
				438	ID: uint64(id),
				439	ElectionTick: cfg.ElectionTicks,
				440	HeartbeatTick: 1,
				441	Storage: s,
				442	MaxSizePerMsg: maxSizePerMsg,
				443	MaxInflightMsgs: maxInflightMsgs,
				444	CheckQuorum: true,
				445	}
				446
				447	n = raft.StartNode(c, peers)
				448	raftStatusMu.Lock()
				449	raftStatus = n.Status
				450	raftStatusMu.Unlock()
				451	return id, n, s, w
				452	}
				453
				454	func restartNode(cfg ServerConfig, snapshot raftpb.Snapshot) (types.ID, membership.RaftCluster, raft.Node, raft.MemoryStorage, wal.WAL) {
				455	var walsnap walpb.Snapshot
				456	if snapshot != nil {
				457	walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
				458	}
				459	w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap)
				460
				461	plog.Infof("restarting member %s in cluster %s at commit index %d", id, cid, st.Commit)
				462	cl := membership.NewCluster("")
				463	cl.SetID(cid)
				464	s := raft.NewMemoryStorage()
				465	if snapshot != nil {
				466	s.ApplySnapshot(*snapshot)
				467	}
				468	s.SetHardState(st)
				469	s.Append(ents)
				470	c := &raft.Config{
				471	ID: uint64(id),
				472	ElectionTick: cfg.ElectionTicks,
				473	HeartbeatTick: 1,
				474	Storage: s,
				475	MaxSizePerMsg: maxSizePerMsg,
				476	MaxInflightMsgs: maxInflightMsgs,
				477	CheckQuorum: true,
				478	}
				479
				480	n := raft.RestartNode(c)
				481	raftStatusMu.Lock()
				482	raftStatus = n.Status
				483	raftStatusMu.Unlock()
				484	return id, cl, n, s, w
				485	}
				486
				487	func restartAsStandaloneNode(cfg ServerConfig, snapshot raftpb.Snapshot) (types.ID, membership.RaftCluster, raft.Node, raft.MemoryStorage, wal.WAL) {
				488	var walsnap walpb.Snapshot
				489	if snapshot != nil {
				490	walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
				491	}
				492	w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap)
				493
				494	// discard the previously uncommitted entries
				495	for i, ent := range ents {
				496	if ent.Index > st.Commit {
				497	plog.Infof("discarding %d uncommitted WAL entries ", len(ents)-i)
				498	ents = ents[:i]
				499	break
				500	}
				501	}
				502
				503	// force append the configuration change entries
				504	toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(id), st.Term, st.Commit)
				505	ents = append(ents, toAppEnts...)
				506
				507	// force commit newly appended entries
				508	err := w.Save(raftpb.HardState{}, toAppEnts)
				509	if err != nil {
				510	plog.Fatalf("%v", err)
				511	}
				512	if len(ents) != 0 {
				513	st.Commit = ents[len(ents)-1].Index
				514	}
				515
				516	plog.Printf("forcing restart of member %s in cluster %s at commit index %d", id, cid, st.Commit)
				517	cl := membership.NewCluster("")
				518	cl.SetID(cid)
				519	s := raft.NewMemoryStorage()
				520	if snapshot != nil {
				521	s.ApplySnapshot(*snapshot)
				522	}
				523	s.SetHardState(st)
				524	s.Append(ents)
				525	c := &raft.Config{
				526	ID: uint64(id),
				527	ElectionTick: cfg.ElectionTicks,
				528	HeartbeatTick: 1,
				529	Storage: s,
				530	MaxSizePerMsg: maxSizePerMsg,
				531	MaxInflightMsgs: maxInflightMsgs,
				532	CheckQuorum: true,
				533	}
				534	n := raft.RestartNode(c)
				535	raftStatus = n.Status
				536	return id, cl, n, s, w
				537	}
				538
				539	// getIDs returns an ordered set of IDs included in the given snapshot and
				540	// the entries. The given snapshot/entries can contain two kinds of
				541	// ID-related entry:
				542	// - ConfChangeAddNode, in which case the contained ID will be added into the set.
				543	// - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
				544	func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
				545	ids := make(map[uint64]bool)
				546	if snap != nil {
				547	for _, id := range snap.Metadata.ConfState.Nodes {
				548	ids[id] = true
				549	}
				550	}
				551	for _, e := range ents {
				552	if e.Type != raftpb.EntryConfChange {
				553	continue
				554	}
				555	var cc raftpb.ConfChange
				556	pbutil.MustUnmarshal(&cc, e.Data)
				557	switch cc.Type {
				558	case raftpb.ConfChangeAddNode:
				559	ids[cc.NodeID] = true
				560	case raftpb.ConfChangeRemoveNode:
				561	delete(ids, cc.NodeID)
				562	case raftpb.ConfChangeUpdateNode:
				563	// do nothing
				564	default:
				565	plog.Panicf("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
				566	}
				567	}
				568	sids := make(types.Uint64Slice, 0, len(ids))
				569	for id := range ids {
				570	sids = append(sids, id)
				571	}
				572	sort.Sort(sids)
				573	return []uint64(sids)
				574	}
				575
				576	// createConfigChangeEnts creates a series of Raft entries (i.e.
				577	// EntryConfChange) to remove the set of given IDs from the cluster. The ID
				578	// `self` is _not_ removed, even if present in the set.
				579	// If `self` is not inside the given ids, it creates a Raft entry to add a
				580	// default member with the given `self`.
				581	func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry {
				582	ents := make([]raftpb.Entry, 0)
				583	next := index + 1
				584	found := false
				585	for _, id := range ids {
				586	if id == self {
				587	found = true
				588	continue
				589	}
				590	cc := &raftpb.ConfChange{
				591	Type: raftpb.ConfChangeRemoveNode,
				592	NodeID: id,
				593	}
				594	e := raftpb.Entry{
				595	Type: raftpb.EntryConfChange,
				596	Data: pbutil.MustMarshal(cc),
				597	Term: term,
				598	Index: next,
				599	}
				600	ents = append(ents, e)
				601	next++
				602	}
				603	if !found {
				604	m := membership.Member{
				605	ID: types.ID(self),
				606	RaftAttributes: membership.RaftAttributes{PeerURLs: []string{"http://localhost:2380"}},
				607	}
				608	ctx, err := json.Marshal(m)
				609	if err != nil {
				610	plog.Panicf("marshal member should never fail: %v", err)
				611	}
				612	cc := &raftpb.ConfChange{
				613	Type: raftpb.ConfChangeAddNode,
				614	NodeID: self,
				615	Context: ctx,
				616	}
				617	e := raftpb.Entry{
				618	Type: raftpb.EntryConfChange,
				619	Data: pbutil.MustMarshal(cc),
				620	Term: term,
				621	Index: next,
				622	}
				623	ents = append(ents, e)
				624	}
				625	return ents
				626	}