blob: 1080633b18bbb723d71f66b21c55d81f88e805e2 [file] [log] [blame]
khenaidooffe076b2019-01-15 16:08:08 -05001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package etcdserver
16
17import (
18 "encoding/json"
19 "expvar"
20 "sort"
21 "sync"
22 "sync/atomic"
23 "time"
24
25 pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
26 "github.com/coreos/etcd/etcdserver/membership"
27 "github.com/coreos/etcd/pkg/contention"
28 "github.com/coreos/etcd/pkg/pbutil"
29 "github.com/coreos/etcd/pkg/types"
30 "github.com/coreos/etcd/raft"
31 "github.com/coreos/etcd/raft/raftpb"
32 "github.com/coreos/etcd/rafthttp"
33 "github.com/coreos/etcd/wal"
34 "github.com/coreos/etcd/wal/walpb"
35 "github.com/coreos/pkg/capnslog"
36)
37
38const (
39 // Number of entries for slow follower to catch-up after compacting
40 // the raft storage entries.
41 // We expect the follower has a millisecond level latency with the leader.
42 // The max throughput is around 10K. Keep a 5K entries is enough for helping
43 // follower to catch up.
44 numberOfCatchUpEntries = 5000
45
46 // The max throughput of etcd will not exceed 100MB/s (100K * 1KB value).
47 // Assuming the RTT is around 10ms, 1MB max size is large enough.
48 maxSizePerMsg = 1 * 1024 * 1024
49 // Never overflow the rafthttp buffer, which is 4096.
50 // TODO: a better const?
51 maxInflightMsgs = 4096 / 8
52)
53
54var (
55 // protects raftStatus
56 raftStatusMu sync.Mutex
57 // indirection for expvar func interface
58 // expvar panics when publishing duplicate name
59 // expvar does not support remove a registered name
60 // so only register a func that calls raftStatus
61 // and change raftStatus as we need.
62 raftStatus func() raft.Status
63)
64
65func init() {
66 raft.SetLogger(capnslog.NewPackageLogger("github.com/coreos/etcd", "raft"))
67 expvar.Publish("raft.status", expvar.Func(func() interface{} {
68 raftStatusMu.Lock()
69 defer raftStatusMu.Unlock()
70 return raftStatus()
71 }))
72}
73
74type RaftTimer interface {
75 Index() uint64
76 Term() uint64
77}
78
79// apply contains entries, snapshot to be applied. Once
80// an apply is consumed, the entries will be persisted to
81// to raft storage concurrently; the application must read
82// raftDone before assuming the raft messages are stable.
83type apply struct {
84 entries []raftpb.Entry
85 snapshot raftpb.Snapshot
86 // notifyc synchronizes etcd server applies with the raft node
87 notifyc chan struct{}
88}
89
90type raftNode struct {
91 // Cache of the latest raft index and raft term the server has seen.
92 // These three unit64 fields must be the first elements to keep 64-bit
93 // alignment for atomic access to the fields.
94 index uint64
95 term uint64
96 lead uint64
97
98 tickMu *sync.Mutex
99 raftNodeConfig
100
101 // a chan to send/receive snapshot
102 msgSnapC chan raftpb.Message
103
104 // a chan to send out apply
105 applyc chan apply
106
107 // a chan to send out readState
108 readStateC chan raft.ReadState
109
110 // utility
111 ticker *time.Ticker
112 // contention detectors for raft heartbeat message
113 td *contention.TimeoutDetector
114
115 stopped chan struct{}
116 done chan struct{}
117}
118
119type raftNodeConfig struct {
120 // to check if msg receiver is removed from cluster
121 isIDRemoved func(id uint64) bool
122 raft.Node
123 raftStorage *raft.MemoryStorage
124 storage Storage
125 heartbeat time.Duration // for logging
126 // transport specifies the transport to send and receive msgs to members.
127 // Sending messages MUST NOT block. It is okay to drop messages, since
128 // clients should timeout and reissue their messages.
129 // If transport is nil, server will panic.
130 transport rafthttp.Transporter
131}
132
133func newRaftNode(cfg raftNodeConfig) *raftNode {
134 r := &raftNode{
135 tickMu: new(sync.Mutex),
136 raftNodeConfig: cfg,
137 // set up contention detectors for raft heartbeat message.
138 // expect to send a heartbeat within 2 heartbeat intervals.
139 td: contention.NewTimeoutDetector(2 * cfg.heartbeat),
140 readStateC: make(chan raft.ReadState, 1),
141 msgSnapC: make(chan raftpb.Message, maxInFlightMsgSnap),
142 applyc: make(chan apply),
143 stopped: make(chan struct{}),
144 done: make(chan struct{}),
145 }
146 if r.heartbeat == 0 {
147 r.ticker = &time.Ticker{}
148 } else {
149 r.ticker = time.NewTicker(r.heartbeat)
150 }
151 return r
152}
153
154// raft.Node does not have locks in Raft package
155func (r *raftNode) tick() {
156 r.tickMu.Lock()
157 r.Tick()
158 r.tickMu.Unlock()
159}
160
161// start prepares and starts raftNode in a new goroutine. It is no longer safe
162// to modify the fields after it has been started.
163func (r *raftNode) start(rh *raftReadyHandler) {
164 internalTimeout := time.Second
165
166 go func() {
167 defer r.onStop()
168 islead := false
169
170 for {
171 select {
172 case <-r.ticker.C:
173 r.tick()
174 case rd := <-r.Ready():
175 if rd.SoftState != nil {
176 newLeader := rd.SoftState.Lead != raft.None && atomic.LoadUint64(&r.lead) != rd.SoftState.Lead
177 if newLeader {
178 leaderChanges.Inc()
179 }
180
181 if rd.SoftState.Lead == raft.None {
182 hasLeader.Set(0)
183 } else {
184 hasLeader.Set(1)
185 }
186
187 atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
188 islead = rd.RaftState == raft.StateLeader
189 if islead {
190 isLeader.Set(1)
191 } else {
192 isLeader.Set(0)
193 }
194 rh.updateLeadership(newLeader)
195 r.td.Reset()
196 }
197
198 if len(rd.ReadStates) != 0 {
199 select {
200 case r.readStateC <- rd.ReadStates[len(rd.ReadStates)-1]:
201 case <-time.After(internalTimeout):
202 plog.Warningf("timed out sending read state")
203 case <-r.stopped:
204 return
205 }
206 }
207
208 notifyc := make(chan struct{}, 1)
209 ap := apply{
210 entries: rd.CommittedEntries,
211 snapshot: rd.Snapshot,
212 notifyc: notifyc,
213 }
214
215 updateCommittedIndex(&ap, rh)
216
217 select {
218 case r.applyc <- ap:
219 case <-r.stopped:
220 return
221 }
222
223 // the leader can write to its disk in parallel with replicating to the followers and them
224 // writing to their disks.
225 // For more details, check raft thesis 10.2.1
226 if islead {
227 // gofail: var raftBeforeLeaderSend struct{}
228 r.transport.Send(r.processMessages(rd.Messages))
229 }
230
231 // gofail: var raftBeforeSave struct{}
232 if err := r.storage.Save(rd.HardState, rd.Entries); err != nil {
233 plog.Fatalf("raft save state and entries error: %v", err)
234 }
235 if !raft.IsEmptyHardState(rd.HardState) {
236 proposalsCommitted.Set(float64(rd.HardState.Commit))
237 }
238 // gofail: var raftAfterSave struct{}
239
240 if !raft.IsEmptySnap(rd.Snapshot) {
241 // gofail: var raftBeforeSaveSnap struct{}
242 if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
243 plog.Fatalf("raft save snapshot error: %v", err)
244 }
245 // etcdserver now claim the snapshot has been persisted onto the disk
246 notifyc <- struct{}{}
247
248 // gofail: var raftAfterSaveSnap struct{}
249 r.raftStorage.ApplySnapshot(rd.Snapshot)
250 plog.Infof("raft applied incoming snapshot at index %d", rd.Snapshot.Metadata.Index)
251 // gofail: var raftAfterApplySnap struct{}
252 }
253
254 r.raftStorage.Append(rd.Entries)
255
256 if !islead {
257 // finish processing incoming messages before we signal raftdone chan
258 msgs := r.processMessages(rd.Messages)
259
260 // now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots
261 notifyc <- struct{}{}
262
263 // Candidate or follower needs to wait for all pending configuration
264 // changes to be applied before sending messages.
265 // Otherwise we might incorrectly count votes (e.g. votes from removed members).
266 // Also slow machine's follower raft-layer could proceed to become the leader
267 // on its own single-node cluster, before apply-layer applies the config change.
268 // We simply wait for ALL pending entries to be applied for now.
269 // We might improve this later on if it causes unnecessary long blocking issues.
270 waitApply := false
271 for _, ent := range rd.CommittedEntries {
272 if ent.Type == raftpb.EntryConfChange {
273 waitApply = true
274 break
275 }
276 }
277 if waitApply {
278 // blocks until 'applyAll' calls 'applyWait.Trigger'
279 // to be in sync with scheduled config-change job
280 // (assume notifyc has cap of 1)
281 select {
282 case notifyc <- struct{}{}:
283 case <-r.stopped:
284 return
285 }
286 }
287
288 // gofail: var raftBeforeFollowerSend struct{}
289 r.transport.Send(msgs)
290 } else {
291 // leader already processed 'MsgSnap' and signaled
292 notifyc <- struct{}{}
293 }
294
295 r.Advance()
296 case <-r.stopped:
297 return
298 }
299 }
300 }()
301}
302
303func updateCommittedIndex(ap *apply, rh *raftReadyHandler) {
304 var ci uint64
305 if len(ap.entries) != 0 {
306 ci = ap.entries[len(ap.entries)-1].Index
307 }
308 if ap.snapshot.Metadata.Index > ci {
309 ci = ap.snapshot.Metadata.Index
310 }
311 if ci != 0 {
312 rh.updateCommittedIndex(ci)
313 }
314}
315
316func (r *raftNode) processMessages(ms []raftpb.Message) []raftpb.Message {
317 sentAppResp := false
318 for i := len(ms) - 1; i >= 0; i-- {
319 if r.isIDRemoved(ms[i].To) {
320 ms[i].To = 0
321 }
322
323 if ms[i].Type == raftpb.MsgAppResp {
324 if sentAppResp {
325 ms[i].To = 0
326 } else {
327 sentAppResp = true
328 }
329 }
330
331 if ms[i].Type == raftpb.MsgSnap {
332 // There are two separate data store: the store for v2, and the KV for v3.
333 // The msgSnap only contains the most recent snapshot of store without KV.
334 // So we need to redirect the msgSnap to etcd server main loop for merging in the
335 // current store snapshot and KV snapshot.
336 select {
337 case r.msgSnapC <- ms[i]:
338 default:
339 // drop msgSnap if the inflight chan if full.
340 }
341 ms[i].To = 0
342 }
343 if ms[i].Type == raftpb.MsgHeartbeat {
344 ok, exceed := r.td.Observe(ms[i].To)
345 if !ok {
346 // TODO: limit request rate.
347 plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v)", r.heartbeat, exceed)
348 plog.Warningf("server is likely overloaded")
349 heartbeatSendFailures.Inc()
350 }
351 }
352 }
353 return ms
354}
355
356func (r *raftNode) apply() chan apply {
357 return r.applyc
358}
359
360func (r *raftNode) stop() {
361 r.stopped <- struct{}{}
362 <-r.done
363}
364
365func (r *raftNode) onStop() {
366 r.Stop()
367 r.ticker.Stop()
368 r.transport.Stop()
369 if err := r.storage.Close(); err != nil {
370 plog.Panicf("raft close storage error: %v", err)
371 }
372 close(r.done)
373}
374
375// for testing
376func (r *raftNode) pauseSending() {
377 p := r.transport.(rafthttp.Pausable)
378 p.Pause()
379}
380
381func (r *raftNode) resumeSending() {
382 p := r.transport.(rafthttp.Pausable)
383 p.Resume()
384}
385
386// advanceTicks advances ticks of Raft node.
387// This can be used for fast-forwarding election
388// ticks in multi data-center deployments, thus
389// speeding up election process.
390func (r *raftNode) advanceTicks(ticks int) {
391 for i := 0; i < ticks; i++ {
392 r.tick()
393 }
394}
395
396func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id types.ID, n raft.Node, s *raft.MemoryStorage, w *wal.WAL) {
397 var err error
398 member := cl.MemberByName(cfg.Name)
399 metadata := pbutil.MustMarshal(
400 &pb.Metadata{
401 NodeID: uint64(member.ID),
402 ClusterID: uint64(cl.ID()),
403 },
404 )
405 if w, err = wal.Create(cfg.WALDir(), metadata); err != nil {
406 plog.Fatalf("create wal error: %v", err)
407 }
408 peers := make([]raft.Peer, len(ids))
409 for i, id := range ids {
410 ctx, err := json.Marshal((*cl).Member(id))
411 if err != nil {
412 plog.Panicf("marshal member should never fail: %v", err)
413 }
414 peers[i] = raft.Peer{ID: uint64(id), Context: ctx}
415 }
416 id = member.ID
417 plog.Infof("starting member %s in cluster %s", id, cl.ID())
418 s = raft.NewMemoryStorage()
419 c := &raft.Config{
420 ID: uint64(id),
421 ElectionTick: cfg.ElectionTicks,
422 HeartbeatTick: 1,
423 Storage: s,
424 MaxSizePerMsg: maxSizePerMsg,
425 MaxInflightMsgs: maxInflightMsgs,
426 CheckQuorum: true,
427 }
428
429 n = raft.StartNode(c, peers)
430 raftStatusMu.Lock()
431 raftStatus = n.Status
432 raftStatusMu.Unlock()
433 return id, n, s, w
434}
435
436func restartNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *membership.RaftCluster, raft.Node, *raft.MemoryStorage, *wal.WAL) {
437 var walsnap walpb.Snapshot
438 if snapshot != nil {
439 walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
440 }
441 w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap)
442
443 plog.Infof("restarting member %s in cluster %s at commit index %d", id, cid, st.Commit)
444 cl := membership.NewCluster("")
445 cl.SetID(cid)
446 s := raft.NewMemoryStorage()
447 if snapshot != nil {
448 s.ApplySnapshot(*snapshot)
449 }
450 s.SetHardState(st)
451 s.Append(ents)
452 c := &raft.Config{
453 ID: uint64(id),
454 ElectionTick: cfg.ElectionTicks,
455 HeartbeatTick: 1,
456 Storage: s,
457 MaxSizePerMsg: maxSizePerMsg,
458 MaxInflightMsgs: maxInflightMsgs,
459 CheckQuorum: true,
460 }
461
462 n := raft.RestartNode(c)
463 raftStatusMu.Lock()
464 raftStatus = n.Status
465 raftStatusMu.Unlock()
466 return id, cl, n, s, w
467}
468
469func restartAsStandaloneNode(cfg ServerConfig, snapshot *raftpb.Snapshot) (types.ID, *membership.RaftCluster, raft.Node, *raft.MemoryStorage, *wal.WAL) {
470 var walsnap walpb.Snapshot
471 if snapshot != nil {
472 walsnap.Index, walsnap.Term = snapshot.Metadata.Index, snapshot.Metadata.Term
473 }
474 w, id, cid, st, ents := readWAL(cfg.WALDir(), walsnap)
475
476 // discard the previously uncommitted entries
477 for i, ent := range ents {
478 if ent.Index > st.Commit {
479 plog.Infof("discarding %d uncommitted WAL entries ", len(ents)-i)
480 ents = ents[:i]
481 break
482 }
483 }
484
485 // force append the configuration change entries
486 toAppEnts := createConfigChangeEnts(getIDs(snapshot, ents), uint64(id), st.Term, st.Commit)
487 ents = append(ents, toAppEnts...)
488
489 // force commit newly appended entries
490 err := w.Save(raftpb.HardState{}, toAppEnts)
491 if err != nil {
492 plog.Fatalf("%v", err)
493 }
494 if len(ents) != 0 {
495 st.Commit = ents[len(ents)-1].Index
496 }
497
498 plog.Printf("forcing restart of member %s in cluster %s at commit index %d", id, cid, st.Commit)
499 cl := membership.NewCluster("")
500 cl.SetID(cid)
501 s := raft.NewMemoryStorage()
502 if snapshot != nil {
503 s.ApplySnapshot(*snapshot)
504 }
505 s.SetHardState(st)
506 s.Append(ents)
507 c := &raft.Config{
508 ID: uint64(id),
509 ElectionTick: cfg.ElectionTicks,
510 HeartbeatTick: 1,
511 Storage: s,
512 MaxSizePerMsg: maxSizePerMsg,
513 MaxInflightMsgs: maxInflightMsgs,
514 CheckQuorum: true,
515 }
516 n := raft.RestartNode(c)
517 raftStatus = n.Status
518 return id, cl, n, s, w
519}
520
521// getIDs returns an ordered set of IDs included in the given snapshot and
522// the entries. The given snapshot/entries can contain two kinds of
523// ID-related entry:
524// - ConfChangeAddNode, in which case the contained ID will be added into the set.
525// - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
526func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
527 ids := make(map[uint64]bool)
528 if snap != nil {
529 for _, id := range snap.Metadata.ConfState.Nodes {
530 ids[id] = true
531 }
532 }
533 for _, e := range ents {
534 if e.Type != raftpb.EntryConfChange {
535 continue
536 }
537 var cc raftpb.ConfChange
538 pbutil.MustUnmarshal(&cc, e.Data)
539 switch cc.Type {
540 case raftpb.ConfChangeAddNode:
541 ids[cc.NodeID] = true
542 case raftpb.ConfChangeRemoveNode:
543 delete(ids, cc.NodeID)
544 case raftpb.ConfChangeUpdateNode:
545 // do nothing
546 default:
547 plog.Panicf("ConfChange Type should be either ConfChangeAddNode or ConfChangeRemoveNode!")
548 }
549 }
550 sids := make(types.Uint64Slice, 0, len(ids))
551 for id := range ids {
552 sids = append(sids, id)
553 }
554 sort.Sort(sids)
555 return []uint64(sids)
556}
557
558// createConfigChangeEnts creates a series of Raft entries (i.e.
559// EntryConfChange) to remove the set of given IDs from the cluster. The ID
560// `self` is _not_ removed, even if present in the set.
561// If `self` is not inside the given ids, it creates a Raft entry to add a
562// default member with the given `self`.
563func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry {
564 ents := make([]raftpb.Entry, 0)
565 next := index + 1
566 found := false
567 for _, id := range ids {
568 if id == self {
569 found = true
570 continue
571 }
572 cc := &raftpb.ConfChange{
573 Type: raftpb.ConfChangeRemoveNode,
574 NodeID: id,
575 }
576 e := raftpb.Entry{
577 Type: raftpb.EntryConfChange,
578 Data: pbutil.MustMarshal(cc),
579 Term: term,
580 Index: next,
581 }
582 ents = append(ents, e)
583 next++
584 }
585 if !found {
586 m := membership.Member{
587 ID: types.ID(self),
588 RaftAttributes: membership.RaftAttributes{PeerURLs: []string{"http://localhost:2380"}},
589 }
590 ctx, err := json.Marshal(m)
591 if err != nil {
592 plog.Panicf("marshal member should never fail: %v", err)
593 }
594 cc := &raftpb.ConfChange{
595 Type: raftpb.ConfChangeAddNode,
596 NodeID: self,
597 Context: ctx,
598 }
599 e := raftpb.Entry{
600 Type: raftpb.EntryConfChange,
601 Data: pbutil.MustMarshal(cc),
602 Term: term,
603 Index: next,
604 }
605 ents = append(ents, e)
606 }
607 return ents
608}