blob: 212dccbdcc8af393d279725dd8fb3e7799727c7f [file] [log] [blame]
khenaidood948f772021-08-11 17:49:24 -04001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package etcdserver
16
17import (
18 "encoding/json"
19 "expvar"
20 "sort"
21 "sync"
22 "sync/atomic"
23 "time"
24
25 pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
26 "github.com/coreos/etcd/etcdserver/membership"
27 "github.com/coreos/etcd/pkg/contention"
28 "github.com/coreos/etcd/pkg/pbutil"
29 "github.com/coreos/etcd/pkg/types"
30 "github.com/coreos/etcd/raft"
31 "github.com/coreos/etcd/raft/raftpb"
32 "github.com/coreos/etcd/rafthttp"
33 "github.com/coreos/etcd/wal"
34 "github.com/coreos/etcd/wal/walpb"
35 "github.com/coreos/pkg/capnslog"
36)
37
const (
	// numberOfCatchUpEntries is the number of entries kept around so that a
	// slow follower can catch up after the raft storage entries have been
	// compacted. Followers are expected to be within millisecond-level
	// latency of the leader and the max throughput is around 10K, so keeping
	// 5K entries is enough to help a follower catch up.
	numberOfCatchUpEntries = 5000

	// maxSizePerMsg is 1MB. The max throughput of etcd will not exceed
	// 100MB/s (100K * 1KB value); assuming the RTT is around 10ms, a 1MB
	// max size is large enough.
	maxSizePerMsg = 1 << 20

	// maxInflightMsgs must never overflow the rafthttp buffer, which is 4096.
	// TODO: a better const?
	maxInflightMsgs = 4096 / 8
)
53
54var (
55 // protects raftStatus
56 raftStatusMu sync.Mutex
57 // indirection for expvar func interface
58 // expvar panics when publishing duplicate name
59 // expvar does not support remove a registered name
60 // so only register a func that calls raftStatus
61 // and change raftStatus as we need.
62 raftStatus func() raft.Status
63)
64
65func init() {
66 raft.SetLogger(capnslog.NewPackageLogger("github.com/coreos/etcd", "raft"))
67 expvar.Publish("raft.status", expvar.Func(func() interface{} {
68 raftStatusMu.Lock()
69 defer raftStatusMu.Unlock()
70 return raftStatus()
71 }))
72}
73
// RaftTimer is implemented by types that can report a raft index and a raft
// term.
// NOTE(review): which index/term is meant (e.g. latest seen vs. applied) is
// not stated by the interface itself; confirm against the implementers.
type RaftTimer interface {
	// Index returns a raft index.
	Index() uint64
	// Term returns a raft term.
	Term() uint64
}
78
79// apply contains entries, snapshot to be applied. Once
80// an apply is consumed, the entries will be persisted to
81// to raft storage concurrently; the application must read
82// raftDone before assuming the raft messages are stable.
83type apply struct {
84 entries []raftpb.Entry
85 snapshot raftpb.Snapshot
86 // notifyc synchronizes etcd server applies with the raft node
87 notifyc chan struct{}
88}
89
// raftNode bundles the raft node configuration with the channels, ticker and
// cached state used by the loop launched in start.
90type raftNode struct {
91 // Cache of the latest raft index and raft term the server has seen.
92 // These three uint64 fields must be the first elements to keep 64-bit
93 // alignment for atomic access to the fields.
94 index uint64
95 term uint64
96 lead uint64
97
 // tickMu serializes calls to Tick (see tick).
98 tickMu *sync.Mutex
99 raftNodeConfig
100
101 // a chan to send/receive snapshot
 // (processMessages diverts outgoing MsgSnap messages onto it)
102 msgSnapC chan raftpb.Message
103
104 // a chan to send out apply
105 applyc chan apply
106
107 // a chan to send out readState
108 readStateC chan raft.ReadState
109
110 // utility
 // ticker drives tick() from the loop in start.
111 ticker *time.Ticker
112 // contention detectors for raft heartbeat message
113 td *contention.TimeoutDetector
114
 // stopped carries the stop request from stop() to the loop in start.
115 stopped chan struct{}
 // done is closed by onStop once shutdown has finished.
116 done chan struct{}
117}
118
119type raftNodeConfig struct {
120 // to check if msg receiver is removed from cluster
121 isIDRemoved func(id uint64) bool
122 raft.Node
123 raftStorage *raft.MemoryStorage
124 storage Storage
125 heartbeat time.Duration // for logging
126 // transport specifies the transport to send and receive msgs to members.
127 // Sending messages MUST NOT block. It is okay to drop messages, since
128 // clients should timeout and reissue their messages.
129 // If transport is nil, server will panic.
130 transport rafthttp.Transporter
131}
132
133func newRaftNode(cfg raftNodeConfig) *raftNode {
134 r := &raftNode{
135 tickMu: new(sync.Mutex),
136 raftNodeConfig: cfg,
137 // set up contention detectors for raft heartbeat message.
138 // expect to send a heartbeat within 2 heartbeat intervals.
139 td: contention.NewTimeoutDetector(2 * cfg.heartbeat),
140 readStateC: make(chan raft.ReadState, 1),
141 msgSnapC: make(chan raftpb.Message, maxInFlightMsgSnap),
142 applyc: make(chan apply),
143 stopped: make(chan struct{}),
144 done: make(chan struct{}),
145 }
146 if r.heartbeat == 0 {
147 r.ticker = &time.Ticker{}
148 } else {
149 r.ticker = time.NewTicker(r.heartbeat)
150 }
151 return r
152}
153
154// raft.Node does not have locks in Raft package
155func (r *raftNode) tick() {
156 r.tickMu.Lock()
157 r.Tick()
158 r.tickMu.Unlock()
159}
160
161// start prepares and starts raftNode in a new goroutine. It is no longer safe
162// to modify the fields after it has been started.
//
// The goroutine ticks raft whenever r.ticker fires and processes each Ready
// batch from r.Ready(). It returns, running r.onStop via defer, as soon as a
// value is received on r.stopped.
163func (r *raftNode) start(rh *raftReadyHandler) {
 // internalTimeout bounds how long the loop waits for a reader of readStateC.
164 internalTimeout := time.Second
165
166 go func() {
167 defer r.onStop()
 // islead caches whether the most recent SoftState reported this node as leader.
168 islead := false
169
170 for {
171 select {
172 case <-r.ticker.C:
173 r.tick()
174 case rd := <-r.Ready():
175 if rd.SoftState != nil {
176 newLeader := rd.SoftState.Lead != raft.None && atomic.LoadUint64(&r.lead) != rd.SoftState.Lead
177 if newLeader {
178 leaderChanges.Inc()
179 }
180
181 if rd.SoftState.Lead == raft.None {
182 hasLeader.Set(0)
183 } else {
184 hasLeader.Set(1)
185 }
186
187 atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
188 islead = rd.RaftState == raft.StateLeader
189 if islead {
190 isLeader.Set(1)
191 } else {
192 isLeader.Set(0)
193 }
194 rh.updateLeadership(newLeader)
195 r.td.Reset()
196 }
197
 // Only the last read state is forwarded; the send is given up after internalTimeout.
198 if len(rd.ReadStates) != 0 {
199 select {
200 case r.readStateC <- rd.ReadStates[len(rd.ReadStates)-1]:
201 case <-time.After(internalTimeout):
202 plog.Warningf("timed out sending read state")
203 case <-r.stopped:
204 return
205 }
206 }
207
 // Capacity 1, so the first signal sent below does not block on the receiver.
208 notifyc := make(chan struct{}, 1)
209 ap := apply{
210 entries: rd.CommittedEntries,
211 snapshot: rd.Snapshot,
212 notifyc: notifyc,
213 }
214
215 updateCommittedIndex(&ap, rh)
216
 // Hand the apply to the consumer of applyc, or give up when stop is requested.
217 select {
218 case r.applyc <- ap:
219 case <-r.stopped:
220 return
221 }
222
223 // the leader can write to its disk in parallel with replicating to the followers and them
224 // writing to their disks.
225 // For more details, check raft thesis 10.2.1
226 if islead {
227 // gofail: var raftBeforeLeaderSend struct{}
228 r.transport.Send(r.processMessages(rd.Messages))
229 }
230
231 // Must save the snapshot file and WAL snapshot entry before saving any other entries or hardstate to
232 // ensure that recovery after a snapshot restore is possible.
233 if !raft.IsEmptySnap(rd.Snapshot) {
234 // gofail: var raftBeforeSaveSnap struct{}
235 if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
236 plog.Fatalf("failed to save Raft snapshot %v", err)
237 }
238 // gofail: var raftAfterSaveSnap struct{}
239 }
240
241 // gofail: var raftBeforeSave struct{}
242 if err := r.storage.Save(rd.HardState, rd.Entries); err != nil {
243 plog.Fatalf("failed to raft save state and entries %v", err)
244 }
245 if !raft.IsEmptyHardState(rd.HardState) {
246 proposalsCommitted.Set(float64(rd.HardState.Commit))
247 }
248 // gofail: var raftAfterSave struct{}
249
250 if !raft.IsEmptySnap(rd.Snapshot) {
251 // Force WAL to fsync its hard state before Release() releases
252 // old data from the WAL. Otherwise could get an error like:
253 // panic: tocommit(107) is out of range [lastIndex(84)]. Was the raft log corrupted, truncated, or lost?
254 // See https://github.com/etcd-io/etcd/issues/10219 for more details.
255 if err := r.storage.Sync(); err != nil {
256 plog.Fatalf("failed to sync Raft snapshot %v", err)
257 }
258
259 // etcdserver now claim the snapshot has been persisted onto the disk
260 notifyc <- struct{}{}
261
262 // gofail: var raftAfterSaveSnap struct{}
263 r.raftStorage.ApplySnapshot(rd.Snapshot)
264 plog.Infof("raft applied incoming snapshot at index %d", rd.Snapshot.Metadata.Index)
265 // gofail: var raftAfterApplySnap struct{}
266
267 if err := r.storage.Release(rd.Snapshot); err != nil {
268 plog.Fatalf("failed to release Raft wal %v", err)
269 }
270 }
271
272 r.raftStorage.Append(rd.Entries)
273
274 if !islead {
275 // finish processing incoming messages before we signal raftdone chan
276 msgs := r.processMessages(rd.Messages)
277
278 // now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots
279 notifyc <- struct{}{}
280
281 // Candidate or follower needs to wait for all pending configuration
282 // changes to be applied before sending messages.
283 // Otherwise we might incorrectly count votes (e.g. votes from removed members).
284 // Also slow machine's follower raft-layer could proceed to become the leader
285 // on its own single-node cluster, before apply-layer applies the config change.
286 // We simply wait for ALL pending entries to be applied for now.
287 // We might improve this later on if it causes unnecessary long blocking issues.
288 waitApply := false
289 for _, ent := range rd.CommittedEntries {
290 if ent.Type == raftpb.EntryConfChange {
291 waitApply = true
292 break
293 }
294 }
295 if waitApply {
296 // blocks until 'applyAll' calls 'applyWait.Trigger'
297 // to be in sync with scheduled config-change job
298 // (assume notifyc has cap of 1)
299 select {
300 case notifyc <- struct{}{}:
301 case <-r.stopped:
302 return
303 }
304 }
305
306 // gofail: var raftBeforeFollowerSend struct{}
307 r.transport.Send(msgs)
308 } else {
309 // leader already processed 'MsgSnap' and signaled
310 notifyc <- struct{}{}
311 }
312
313 r.Advance()
314 case <-r.stopped:
315 return
316 }
317 }
318 }()
319}
320
321func updateCommittedIndex(ap *apply, rh *raftReadyHandler) {
322 var ci uint64
323 if len(ap.entries) != 0 {
324 ci = ap.entries[len(ap.entries)-1].Index
325 }
326 if ap.snapshot.Metadata.Index > ci {
327 ci = ap.snapshot.Metadata.Index
328 }
329 if ci != 0 {
330 rh.updateCommittedIndex(ci)
331 }
332}
333
334func (r *raftNode) processMessages(ms []raftpb.Message) []raftpb.Message {
335 sentAppResp := false
336 for i := len(ms) - 1; i >= 0; i-- {
337 if r.isIDRemoved(ms[i].To) {
338 ms[i].To = 0
339 }
340
341 if ms[i].Type == raftpb.MsgAppResp {
342 if sentAppResp {
343 ms[i].To = 0
344 } else {
345 sentAppResp = true
346 }
347 }
348
349 if ms[i].Type == raftpb.MsgSnap {
350 // There are two separate data store: the store for v2, and the KV for v3.
351 // The msgSnap only contains the most recent snapshot of store without KV.
352 // So we need to redirect the msgSnap to etcd server main loop for merging in the
353 // current store snapshot and KV snapshot.
354 select {
355 case r.msgSnapC <- ms[i]:
356 default:
357 // drop msgSnap if the inflight chan if full.
358 }
359 ms[i].To = 0
360 }
361 if ms[i].Type == raftpb.MsgHeartbeat {
362 ok, exceed := r.td.Observe(ms[i].To)
363 if !ok {
364 // TODO: limit request rate.
365 plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v, to %x)", r.heartbeat, exceed, ms[i].To)
366 plog.Warningf("server is likely overloaded")
367 heartbeatSendFailures.Inc()
368 }
369 }
370 }
371 return ms
372}
373
374func (r *raftNode) apply() chan apply {
375 return r.applyc
376}
377
378func (r *raftNode) stop() {
379 r.stopped <- struct{}{}
380 <-r.done
381}
382
383func (r *raftNode) onStop() {
384 r.Stop()
385 r.ticker.Stop()
386 r.transport.Stop()
387 if err := r.storage.Close(); err != nil {
388 plog.Panicf("raft close storage error: %v", err)
389 }
390 close(r.done)
391}
392
393// for testing
394func (r *raftNode) pauseSending() {
395 p := r.transport.(rafthttp.Pausable)
396 p.Pause()
397}
398
399func (r *raftNode) resumeSending() {
400 p := r.transport.(rafthttp.Pausable)
401 p.Resume()
402}
403
404// advanceTicks advances ticks of Raft node.
405// This can be used for fast-forwarding election
406// ticks in multi data-center deployments, thus
407// speeding up election process.
408func (r *raftNode) advanceTicks(ticks int) {
409 for i := 0; i < ticks; i++ {
410 r.tick()
411 }
412}
413
414func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id types.ID, n raft.Node, s *raft.MemoryStorage, w *wal.WAL) {
415 var err error
416 member := cl.MemberByName(cfg.Name)
417 metadata := pbutil.MustMarshal(
418 &pb.Metadata{
419 NodeID: uint64(member.ID),
420 ClusterID: uint64(cl.ID()),
421 },
422 )
423 if w, err = wal.Create(cfg.WALDir(), metadata); err != nil {
424 plog.Panicf("create wal error: %v", err)
425 }
426 peers := make([]raft.Peer, len(ids))
427 for i, id := range ids {
428 ctx, err := json.Marshal((*cl).Member(id))
429 if err != nil {
430 plog.Panicf("marshal member should never fail: %v", err)
431 }
432 peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
433 }
434 id = member.ID
435 plog.Infof("starting member %s in cluster %s", id, cl.ID())
436 s = raft.NewMemoryStorage()
437 c := &raft.Config{
438 ID: uint64(id),
439 ElectionTick: cfg.ElectionTicks,
440 HeartbeatTick: 1,
441 Storage: s,
442 MaxSizePerMsg: maxSizePerMsg,
443 MaxInflightMsgs: maxInflightMsgs,
444 CheckQuorum: true,
445 }
446
447 n = raft.StartNode(c, peers)
448 raftStatusMu.Lock()
449 raftStatus = n.Status
450 raftStatusMu.Unlock()
451 return id, n, s, w
452}
453
454func restartNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *membership.RaftCluster, raft.Node, *raft.MemoryStorage, *wal.WAL) {
455 var walsnap walpb.Snapshot
456 if snapshot != nil {
457 walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
458 }
459 w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap)
460
461 plog.Infof("restarting member %s in cluster %s at commit index %d", id, cid, st.Commit)
462 cl := membership.NewCluster("")
463 cl.SetID(cid)
464 s := raft.NewMemoryStorage()
465 if snapshot != nil {
466 s.ApplySnapshot(*snapshot)
467 }
468 s.SetHardState(st)
469 s.Append(ents)
470 c := &raft.Config{
471 ID: uint64(id),
472 ElectionTick: cfg.ElectionTicks,
473 HeartbeatTick: 1,
474 Storage: s,
475 MaxSizePerMsg: maxSizePerMsg,
476 MaxInflightMsgs: maxInflightMsgs,
477 CheckQuorum: true,
478 }
479
480 n := raft.RestartNode(c)
481 raftStatusMu.Lock()
482 raftStatus = n.Status
483 raftStatusMu.Unlock()
484 return id, cl, n, s, w
485}
486
487func restartAsStandaloneNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *membership.RaftCluster, raft.Node, *raft.MemoryStorage, *wal.WAL) {
488 var walsnap walpb.Snapshot
489 if snapshot != nil {
490 walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
491 }
492 w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap)
493
494 // discard the previously uncommitted entries
495 for i, ent := range ents {
496 if ent.Index > st.Commit {
497 plog.Infof("discarding %d uncommitted WAL entries ", len(ents)-i)
498 ents = ents[:i]
499 break
500 }
501 }
502
503 // force append the configuration change entries
504 toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(id), st.Term, st.Commit)
505 ents = append(ents, toAppEnts...)
506
507 // force commit newly appended entries
508 err := w.Save(raftpb.HardState{}, toAppEnts)
509 if err != nil {
510 plog.Fatalf("%v", err)
511 }
512 if len(ents) != 0 {
513 st.Commit = ents[len(ents)-1].Index
514 }
515
516 plog.Printf("forcing restart of member %s in cluster %s at commit index %d", id, cid, st.Commit)
517 cl := membership.NewCluster("")
518 cl.SetID(cid)
519 s := raft.NewMemoryStorage()
520 if snapshot != nil {
521 s.ApplySnapshot(*snapshot)
522 }
523 s.SetHardState(st)
524 s.Append(ents)
525 c := &raft.Config{
526 ID: uint64(id),
527 ElectionTick: cfg.ElectionTicks,
528 HeartbeatTick: 1,
529 Storage: s,
530 MaxSizePerMsg: maxSizePerMsg,
531 MaxInflightMsgs: maxInflightMsgs,
532 CheckQuorum: true,
533 }
534 n := raft.RestartNode(c)
535 raftStatus = n.Status
536 return id, cl, n, s, w
537}
538
539// getIDs returns an ordered set of IDs included in the given snapshot and
540// the entries. The given snapshot/entries can contain two kinds of
541// ID-related entry:
542// - ConfChangeAddNode, in which case the contained ID will be added into the set.
543// - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
544func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
545 ids := make(map[uint64]bool)
546 if snap != nil {
547 for _, id := range snap.Metadata.ConfState.Nodes {
548 ids[id] = true
549 }
550 }
551 for _, e := range ents {
552 if e.Type != raftpb.EntryConfChange {
553 continue
554 }
555 var cc raftpb.ConfChange
556 pbutil.MustUnmarshal(&cc, e.Data)
557 switch cc.Type {
558 case raftpb.ConfChangeAddNode:
559 ids[cc.NodeID] = true
560 case raftpb.ConfChangeRemoveNode:
561 delete(ids, cc.NodeID)
562 case raftpb.ConfChangeUpdateNode:
563 // do nothing
564 default:
565 plog.Panicf("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
566 }
567 }
568 sids := make(types.Uint64Slice, 0, len(ids))
569 for id := range ids {
570 sids = append(sids, id)
571 }
572 sort.Sort(sids)
573 return []uint64(sids)
574}
575
576// createConfigChangeEnts creates a series of Raft entries (i.e.
577// EntryConfChange) to remove the set of given IDs from the cluster. The ID
578// `self` is _not_ removed, even if present in the set.
579// If `self` is not inside the given ids, it creates a Raft entry to add a
580// default member with the given `self`.
581func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry {
582 ents := make([]raftpb.Entry, 0)
583 next := index + 1
584 found := false
585 for _, id := range ids {
586 if id == self {
587 found = true
588 continue
589 }
590 cc := &raftpb.ConfChange{
591 Type: raftpb.ConfChangeRemoveNode,
592 NodeID: id,
593 }
594 e := raftpb.Entry{
595 Type: raftpb.EntryConfChange,
596 Data: pbutil.MustMarshal(cc),
597 Term: term,
598 Index: next,
599 }
600 ents = append(ents, e)
601 next++
602 }
603 if !found {
604 m := membership.Member{
605 ID: types.ID(self),
606 RaftAttributes: membership.RaftAttributes{PeerURLs: []string{"http://localhost:2380"}},
607 }
608 ctx, err := json.Marshal(m)
609 if err != nil {
610 plog.Panicf("marshal member should never fail: %v", err)
611 }
612 cc := &raftpb.ConfChange{
613 Type: raftpb.ConfChangeAddNode,
614 NodeID: self,
615 Context: ctx,
616 }
617 e := raftpb.Entry{
618 Type: raftpb.EntryConfChange,
619 Data: pbutil.MustMarshal(cc),
620 Term: term,
621 Index: next,
622 }
623 ents = append(ents, e)
624 }
625 return ents
626}