Stephane Barbarie | 260a563 | 2019-02-26 16:12:49 -0500 | [diff] [blame] | 1 | // Copyright 2015 The etcd Authors |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | package raft |
| 16 | |
| 17 | import "fmt" |
| 18 | |
| 19 | const ( |
| 20 | ProgressStateProbe ProgressStateType = iota |
| 21 | ProgressStateReplicate |
| 22 | ProgressStateSnapshot |
| 23 | ) |
| 24 | |
| 25 | type ProgressStateType uint64 |
| 26 | |
| 27 | var prstmap = [...]string{ |
| 28 | "ProgressStateProbe", |
| 29 | "ProgressStateReplicate", |
| 30 | "ProgressStateSnapshot", |
| 31 | } |
| 32 | |
| 33 | func (st ProgressStateType) String() string { return prstmap[uint64(st)] } |
| 34 | |
| 35 | // Progress represents a follower’s progress in the view of the leader. Leader maintains |
| 36 | // progresses of all followers, and sends entries to the follower based on its progress. |
| 37 | type Progress struct { |
| 38 | Match, Next uint64 |
| 39 | // State defines how the leader should interact with the follower. |
| 40 | // |
| 41 | // When in ProgressStateProbe, leader sends at most one replication message |
| 42 | // per heartbeat interval. It also probes actual progress of the follower. |
| 43 | // |
| 44 | // When in ProgressStateReplicate, leader optimistically increases next |
| 45 | // to the latest entry sent after sending replication message. This is |
| 46 | // an optimized state for fast replicating log entries to the follower. |
| 47 | // |
| 48 | // When in ProgressStateSnapshot, leader should have sent out snapshot |
| 49 | // before and stops sending any replication message. |
| 50 | State ProgressStateType |
| 51 | |
| 52 | // Paused is used in ProgressStateProbe. |
| 53 | // When Paused is true, raft should pause sending replication message to this peer. |
| 54 | Paused bool |
| 55 | // PendingSnapshot is used in ProgressStateSnapshot. |
| 56 | // If there is a pending snapshot, the pendingSnapshot will be set to the |
| 57 | // index of the snapshot. If pendingSnapshot is set, the replication process of |
| 58 | // this Progress will be paused. raft will not resend snapshot until the pending one |
| 59 | // is reported to be failed. |
| 60 | PendingSnapshot uint64 |
| 61 | |
| 62 | // RecentActive is true if the progress is recently active. Receiving any messages |
| 63 | // from the corresponding follower indicates the progress is active. |
| 64 | // RecentActive can be reset to false after an election timeout. |
| 65 | RecentActive bool |
| 66 | |
| 67 | // inflights is a sliding window for the inflight messages. |
| 68 | // Each inflight message contains one or more log entries. |
| 69 | // The max number of entries per message is defined in raft config as MaxSizePerMsg. |
| 70 | // Thus inflight effectively limits both the number of inflight messages |
| 71 | // and the bandwidth each Progress can use. |
| 72 | // When inflights is full, no more message should be sent. |
| 73 | // When a leader sends out a message, the index of the last |
| 74 | // entry should be added to inflights. The index MUST be added |
| 75 | // into inflights in order. |
| 76 | // When a leader receives a reply, the previous inflights should |
| 77 | // be freed by calling inflights.freeTo with the index of the last |
| 78 | // received entry. |
| 79 | ins *inflights |
| 80 | |
| 81 | // IsLearner is true if this progress is tracked for a learner. |
| 82 | IsLearner bool |
| 83 | } |
| 84 | |
| 85 | func (pr *Progress) resetState(state ProgressStateType) { |
| 86 | pr.Paused = false |
| 87 | pr.PendingSnapshot = 0 |
| 88 | pr.State = state |
| 89 | pr.ins.reset() |
| 90 | } |
| 91 | |
| 92 | func (pr *Progress) becomeProbe() { |
| 93 | // If the original state is ProgressStateSnapshot, progress knows that |
| 94 | // the pending snapshot has been sent to this peer successfully, then |
| 95 | // probes from pendingSnapshot + 1. |
| 96 | if pr.State == ProgressStateSnapshot { |
| 97 | pendingSnapshot := pr.PendingSnapshot |
| 98 | pr.resetState(ProgressStateProbe) |
| 99 | pr.Next = max(pr.Match+1, pendingSnapshot+1) |
| 100 | } else { |
| 101 | pr.resetState(ProgressStateProbe) |
| 102 | pr.Next = pr.Match + 1 |
| 103 | } |
| 104 | } |
| 105 | |
| 106 | func (pr *Progress) becomeReplicate() { |
| 107 | pr.resetState(ProgressStateReplicate) |
| 108 | pr.Next = pr.Match + 1 |
| 109 | } |
| 110 | |
| 111 | func (pr *Progress) becomeSnapshot(snapshoti uint64) { |
| 112 | pr.resetState(ProgressStateSnapshot) |
| 113 | pr.PendingSnapshot = snapshoti |
| 114 | } |
| 115 | |
| 116 | // maybeUpdate returns false if the given n index comes from an outdated message. |
| 117 | // Otherwise it updates the progress and returns true. |
| 118 | func (pr *Progress) maybeUpdate(n uint64) bool { |
| 119 | var updated bool |
| 120 | if pr.Match < n { |
| 121 | pr.Match = n |
| 122 | updated = true |
| 123 | pr.resume() |
| 124 | } |
| 125 | if pr.Next < n+1 { |
| 126 | pr.Next = n + 1 |
| 127 | } |
| 128 | return updated |
| 129 | } |
| 130 | |
| 131 | func (pr *Progress) optimisticUpdate(n uint64) { pr.Next = n + 1 } |
| 132 | |
| 133 | // maybeDecrTo returns false if the given to index comes from an out of order message. |
| 134 | // Otherwise it decreases the progress next index to min(rejected, last) and returns true. |
| 135 | func (pr *Progress) maybeDecrTo(rejected, last uint64) bool { |
| 136 | if pr.State == ProgressStateReplicate { |
| 137 | // the rejection must be stale if the progress has matched and "rejected" |
| 138 | // is smaller than "match". |
| 139 | if rejected <= pr.Match { |
| 140 | return false |
| 141 | } |
| 142 | // directly decrease next to match + 1 |
| 143 | pr.Next = pr.Match + 1 |
| 144 | return true |
| 145 | } |
| 146 | |
| 147 | // the rejection must be stale if "rejected" does not match next - 1 |
| 148 | if pr.Next-1 != rejected { |
| 149 | return false |
| 150 | } |
| 151 | |
| 152 | if pr.Next = min(rejected, last+1); pr.Next < 1 { |
| 153 | pr.Next = 1 |
| 154 | } |
| 155 | pr.resume() |
| 156 | return true |
| 157 | } |
| 158 | |
| 159 | func (pr *Progress) pause() { pr.Paused = true } |
| 160 | func (pr *Progress) resume() { pr.Paused = false } |
| 161 | |
| 162 | // IsPaused returns whether sending log entries to this node has been |
| 163 | // paused. A node may be paused because it has rejected recent |
| 164 | // MsgApps, is currently waiting for a snapshot, or has reached the |
| 165 | // MaxInflightMsgs limit. |
| 166 | func (pr *Progress) IsPaused() bool { |
| 167 | switch pr.State { |
| 168 | case ProgressStateProbe: |
| 169 | return pr.Paused |
| 170 | case ProgressStateReplicate: |
| 171 | return pr.ins.full() |
| 172 | case ProgressStateSnapshot: |
| 173 | return true |
| 174 | default: |
| 175 | panic("unexpected state") |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | func (pr *Progress) snapshotFailure() { pr.PendingSnapshot = 0 } |
| 180 | |
| 181 | // needSnapshotAbort returns true if snapshot progress's Match |
| 182 | // is equal or higher than the pendingSnapshot. |
| 183 | func (pr *Progress) needSnapshotAbort() bool { |
| 184 | return pr.State == ProgressStateSnapshot && pr.Match >= pr.PendingSnapshot |
| 185 | } |
| 186 | |
| 187 | func (pr *Progress) String() string { |
| 188 | return fmt.Sprintf("next = %d, match = %d, state = %s, waiting = %v, pendingSnapshot = %d", pr.Next, pr.Match, pr.State, pr.IsPaused(), pr.PendingSnapshot) |
| 189 | } |
| 190 | |
| 191 | type inflights struct { |
| 192 | // the starting index in the buffer |
| 193 | start int |
| 194 | // number of inflights in the buffer |
| 195 | count int |
| 196 | |
| 197 | // the size of the buffer |
| 198 | size int |
| 199 | |
| 200 | // buffer contains the index of the last entry |
| 201 | // inside one message. |
| 202 | buffer []uint64 |
| 203 | } |
| 204 | |
| 205 | func newInflights(size int) *inflights { |
| 206 | return &inflights{ |
| 207 | size: size, |
| 208 | } |
| 209 | } |
| 210 | |
| 211 | // add adds an inflight into inflights |
| 212 | func (in *inflights) add(inflight uint64) { |
| 213 | if in.full() { |
| 214 | panic("cannot add into a full inflights") |
| 215 | } |
| 216 | next := in.start + in.count |
| 217 | size := in.size |
| 218 | if next >= size { |
| 219 | next -= size |
| 220 | } |
| 221 | if next >= len(in.buffer) { |
| 222 | in.growBuf() |
| 223 | } |
| 224 | in.buffer[next] = inflight |
| 225 | in.count++ |
| 226 | } |
| 227 | |
| 228 | // grow the inflight buffer by doubling up to inflights.size. We grow on demand |
| 229 | // instead of preallocating to inflights.size to handle systems which have |
| 230 | // thousands of Raft groups per process. |
| 231 | func (in *inflights) growBuf() { |
| 232 | newSize := len(in.buffer) * 2 |
| 233 | if newSize == 0 { |
| 234 | newSize = 1 |
| 235 | } else if newSize > in.size { |
| 236 | newSize = in.size |
| 237 | } |
| 238 | newBuffer := make([]uint64, newSize) |
| 239 | copy(newBuffer, in.buffer) |
| 240 | in.buffer = newBuffer |
| 241 | } |
| 242 | |
| 243 | // freeTo frees the inflights smaller or equal to the given `to` flight. |
| 244 | func (in *inflights) freeTo(to uint64) { |
| 245 | if in.count == 0 || to < in.buffer[in.start] { |
| 246 | // out of the left side of the window |
| 247 | return |
| 248 | } |
| 249 | |
| 250 | idx := in.start |
| 251 | var i int |
| 252 | for i = 0; i < in.count; i++ { |
| 253 | if to < in.buffer[idx] { // found the first large inflight |
| 254 | break |
| 255 | } |
| 256 | |
| 257 | // increase index and maybe rotate |
| 258 | size := in.size |
| 259 | if idx++; idx >= size { |
| 260 | idx -= size |
| 261 | } |
| 262 | } |
| 263 | // free i inflights and set new start index |
| 264 | in.count -= i |
| 265 | in.start = idx |
| 266 | if in.count == 0 { |
| 267 | // inflights is empty, reset the start index so that we don't grow the |
| 268 | // buffer unnecessarily. |
| 269 | in.start = 0 |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | func (in *inflights) freeFirstOne() { in.freeTo(in.buffer[in.start]) } |
| 274 | |
| 275 | // full returns true if the inflights is full. |
| 276 | func (in *inflights) full() bool { |
| 277 | return in.count == in.size |
| 278 | } |
| 279 | |
| 280 | // resets frees all inflights. |
| 281 | func (in *inflights) reset() { |
| 282 | in.count = 0 |
| 283 | in.start = 0 |
| 284 | } |