// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15package raft
16
17import "fmt"
18
// The states a Progress can be in. They select how the leader talks to the
// corresponding follower.
const (
	// ProgressStateProbe: the leader sends at most one replication message
	// per heartbeat interval and probes the follower's actual progress.
	ProgressStateProbe ProgressStateType = iota
	// ProgressStateReplicate: the leader optimistically advances Next after
	// each replication message it sends.
	ProgressStateReplicate
	// ProgressStateSnapshot: the leader has sent a snapshot and stops
	// sending replication messages.
	ProgressStateSnapshot
)

// ProgressStateType is the type of Progress.State.
type ProgressStateType uint64

// prstmap holds the printable name of every declared ProgressStateType,
// indexed by the numeric value of the state.
var prstmap = [...]string{
	"ProgressStateProbe",
	"ProgressStateReplicate",
	"ProgressStateSnapshot",
}

// String returns the name of the state. A value that is not one of the
// declared states is rendered as "ProgressStateType(N)" instead of panicking
// with an out-of-range index into prstmap.
func (st ProgressStateType) String() string {
	if st < ProgressStateType(len(prstmap)) {
		return prstmap[st]
	}
	return fmt.Sprintf("ProgressStateType(%d)", uint64(st))
}
34
35// Progress represents a follower’s progress in the view of the leader. Leader maintains
36// progresses of all followers, and sends entries to the follower based on its progress.
37type Progress struct {
38 Match, Next uint64
39 // State defines how the leader should interact with the follower.
40 //
41 // When in ProgressStateProbe, leader sends at most one replication message
42 // per heartbeat interval. It also probes actual progress of the follower.
43 //
44 // When in ProgressStateReplicate, leader optimistically increases next
45 // to the latest entry sent after sending replication message. This is
46 // an optimized state for fast replicating log entries to the follower.
47 //
48 // When in ProgressStateSnapshot, leader should have sent out snapshot
49 // before and stops sending any replication message.
50 State ProgressStateType
51
52 // Paused is used in ProgressStateProbe.
53 // When Paused is true, raft should pause sending replication message to this peer.
54 Paused bool
55 // PendingSnapshot is used in ProgressStateSnapshot.
56 // If there is a pending snapshot, the pendingSnapshot will be set to the
57 // index of the snapshot. If pendingSnapshot is set, the replication process of
58 // this Progress will be paused. raft will not resend snapshot until the pending one
59 // is reported to be failed.
60 PendingSnapshot uint64
61
62 // RecentActive is true if the progress is recently active. Receiving any messages
63 // from the corresponding follower indicates the progress is active.
64 // RecentActive can be reset to false after an election timeout.
65 RecentActive bool
66
67 // inflights is a sliding window for the inflight messages.
68 // Each inflight message contains one or more log entries.
69 // The max number of entries per message is defined in raft config as MaxSizePerMsg.
70 // Thus inflight effectively limits both the number of inflight messages
71 // and the bandwidth each Progress can use.
72 // When inflights is full, no more message should be sent.
73 // When a leader sends out a message, the index of the last
74 // entry should be added to inflights. The index MUST be added
75 // into inflights in order.
76 // When a leader receives a reply, the previous inflights should
77 // be freed by calling inflights.freeTo with the index of the last
78 // received entry.
79 ins *inflights
80
81 // IsLearner is true if this progress is tracked for a learner.
82 IsLearner bool
83}
84
85func (pr *Progress) resetState(state ProgressStateType) {
86 pr.Paused = false
87 pr.PendingSnapshot = 0
88 pr.State = state
89 pr.ins.reset()
90}
91
92func (pr *Progress) becomeProbe() {
93 // If the original state is ProgressStateSnapshot, progress knows that
94 // the pending snapshot has been sent to this peer successfully, then
95 // probes from pendingSnapshot + 1.
96 if pr.State == ProgressStateSnapshot {
97 pendingSnapshot := pr.PendingSnapshot
98 pr.resetState(ProgressStateProbe)
99 pr.Next = max(pr.Match+1, pendingSnapshot+1)
100 } else {
101 pr.resetState(ProgressStateProbe)
102 pr.Next = pr.Match + 1
103 }
104}
105
106func (pr *Progress) becomeReplicate() {
107 pr.resetState(ProgressStateReplicate)
108 pr.Next = pr.Match + 1
109}
110
111func (pr *Progress) becomeSnapshot(snapshoti uint64) {
112 pr.resetState(ProgressStateSnapshot)
113 pr.PendingSnapshot = snapshoti
114}
115
116// maybeUpdate returns false if the given n index comes from an outdated message.
117// Otherwise it updates the progress and returns true.
118func (pr *Progress) maybeUpdate(n uint64) bool {
119 var updated bool
120 if pr.Match < n {
121 pr.Match = n
122 updated = true
123 pr.resume()
124 }
125 if pr.Next < n+1 {
126 pr.Next = n + 1
127 }
128 return updated
129}
130
131func (pr *Progress) optimisticUpdate(n uint64) { pr.Next = n + 1 }
132
133// maybeDecrTo returns false if the given to index comes from an out of order message.
134// Otherwise it decreases the progress next index to min(rejected, last) and returns true.
135func (pr *Progress) maybeDecrTo(rejected, last uint64) bool {
136 if pr.State == ProgressStateReplicate {
137 // the rejection must be stale if the progress has matched and "rejected"
138 // is smaller than "match".
139 if rejected <= pr.Match {
140 return false
141 }
142 // directly decrease next to match + 1
143 pr.Next = pr.Match + 1
144 return true
145 }
146
147 // the rejection must be stale if "rejected" does not match next - 1
148 if pr.Next-1 != rejected {
149 return false
150 }
151
152 if pr.Next = min(rejected, last+1); pr.Next < 1 {
153 pr.Next = 1
154 }
155 pr.resume()
156 return true
157}
158
159func (pr *Progress) pause() { pr.Paused = true }
160func (pr *Progress) resume() { pr.Paused = false }
161
162// IsPaused returns whether sending log entries to this node has been
163// paused. A node may be paused because it has rejected recent
164// MsgApps, is currently waiting for a snapshot, or has reached the
165// MaxInflightMsgs limit.
166func (pr *Progress) IsPaused() bool {
167 switch pr.State {
168 case ProgressStateProbe:
169 return pr.Paused
170 case ProgressStateReplicate:
171 return pr.ins.full()
172 case ProgressStateSnapshot:
173 return true
174 default:
175 panic("unexpected state")
176 }
177}
178
179func (pr *Progress) snapshotFailure() { pr.PendingSnapshot = 0 }
180
181// needSnapshotAbort returns true if snapshot progress's Match
182// is equal or higher than the pendingSnapshot.
183func (pr *Progress) needSnapshotAbort() bool {
184 return pr.State == ProgressStateSnapshot && pr.Match >= pr.PendingSnapshot
185}
186
187func (pr *Progress) String() string {
188 return fmt.Sprintf("next = %d, match = %d, state = %s, waiting = %v, pendingSnapshot = %d", pr.Next, pr.Match, pr.State, pr.IsPaused(), pr.PendingSnapshot)
189}
190
// inflights is a bounded sliding window over the messages that have been sent
// but not yet freed. It is a ring buffer of at most size slots whose backing
// storage is grown on demand.
type inflights struct {
	// start is the position in buffer of the oldest inflight.
	start int
	// count is the number of inflights currently held.
	count int

	// size is the maximum number of inflights the window can hold.
	size int

	// buffer holds, for each inflight message, the index of the last entry
	// inside that message.
	buffer []uint64
}

// newInflights returns an empty window that can hold up to size inflights.
// No buffer is allocated until the first add.
func newInflights(size int) *inflights {
	return &inflights{
		size: size,
	}
}

// add appends inflight to the window, growing the buffer if the target slot
// is not allocated yet. It panics if the window is already full.
func (in *inflights) add(inflight uint64) {
	if in.full() {
		panic("cannot add into a full inflights")
	}
	// Slot right after the newest inflight, wrapped around the ring.
	next := in.start + in.count
	if next >= in.size {
		next -= in.size
	}
	if next >= len(in.buffer) {
		in.growBuf()
	}
	in.buffer[next] = inflight
	in.count++
}

// growBuf grows the inflight buffer by doubling, capped at inflights.size.
// The buffer is grown on demand instead of being preallocated to
// inflights.size to handle systems which have thousands of Raft groups per
// process.
func (in *inflights) growBuf() {
	newSize := len(in.buffer) * 2
	switch {
	case newSize == 0:
		newSize = 1
	case newSize > in.size:
		newSize = in.size
	}
	newBuffer := make([]uint64, newSize)
	copy(newBuffer, in.buffer)
	in.buffer = newBuffer
}

// freeTo frees every inflight that is smaller than or equal to to. It does
// nothing when the window is empty or when to is smaller than the oldest
// inflight.
func (in *inflights) freeTo(to uint64) {
	if in.count == 0 || to < in.buffer[in.start] {
		// out of the left side of the window
		return
	}

	idx, freed := in.start, 0
	for freed < in.count && in.buffer[idx] <= to {
		// advance and wrap around the ring if needed
		if idx++; idx >= in.size {
			idx -= in.size
		}
		freed++
	}

	in.count -= freed
	in.start = idx
	if in.count == 0 {
		// The window is empty: restart at slot 0 so that the buffer is not
		// grown unnecessarily.
		in.start = 0
	}
}

// freeFirstOne frees the oldest inflight. On an empty window it is a no-op,
// matching freeTo; without the guard a window whose buffer was never
// allocated would panic on the buffer read.
func (in *inflights) freeFirstOne() {
	if in.count == 0 {
		return
	}
	in.freeTo(in.buffer[in.start])
}

// full returns true if the window holds size inflights.
func (in *inflights) full() bool {
	return in.count == in.size
}

// reset frees all inflights. The allocated buffer is kept.
func (in *inflights) reset() {
	in.count = 0
	in.start = 0
}