Scott Baker | 611f6bd | 2019-10-18 13:45:19 -0700 | [diff] [blame] | 1 | // Copyright 2019 The etcd Authors |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | package confchange |
| 16 | |
| 17 | import ( |
| 18 | "errors" |
| 19 | "fmt" |
| 20 | "strings" |
| 21 | |
| 22 | "go.etcd.io/etcd/raft/quorum" |
| 23 | pb "go.etcd.io/etcd/raft/raftpb" |
| 24 | "go.etcd.io/etcd/raft/tracker" |
| 25 | ) |
| 26 | |
| 27 | // Changer facilitates configuration changes. It exposes methods to handle |
| 28 | // simple and joint consensus while performing the proper validation that allows |
| 29 | // refusing invalid configuration changes before they affect the active |
| 30 | // configuration. |
| 31 | type Changer struct { |
| 32 | Tracker tracker.ProgressTracker |
| 33 | LastIndex uint64 |
| 34 | } |
| 35 | |
| 36 | // EnterJoint verifies that the outgoing (=right) majority config of the joint |
| 37 | // config is empty and initializes it with a copy of the incoming (=left) |
| 38 | // majority config. That is, it transitions from |
| 39 | // |
| 40 | // (1 2 3)&&() |
| 41 | // to |
| 42 | // (1 2 3)&&(1 2 3). |
| 43 | // |
| 44 | // The supplied changes are then applied to the incoming majority config, |
| 45 | // resulting in a joint configuration that in terms of the Raft thesis[1] |
| 46 | // (Section 4.3) corresponds to `C_{new,old}`. |
| 47 | // |
| 48 | // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf |
| 49 | func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) { |
| 50 | cfg, prs, err := c.checkAndCopy() |
| 51 | if err != nil { |
| 52 | return c.err(err) |
| 53 | } |
| 54 | if joint(cfg) { |
| 55 | err := errors.New("config is already joint") |
| 56 | return c.err(err) |
| 57 | } |
| 58 | if len(incoming(cfg.Voters)) == 0 { |
| 59 | // We allow adding nodes to an empty config for convenience (testing and |
| 60 | // bootstrap), but you can't enter a joint state. |
| 61 | err := errors.New("can't make a zero-voter config joint") |
| 62 | return c.err(err) |
| 63 | } |
| 64 | // Clear the outgoing config. |
| 65 | *outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{} |
| 66 | // Copy incoming to outgoing. |
| 67 | for id := range incoming(cfg.Voters) { |
| 68 | outgoing(cfg.Voters)[id] = struct{}{} |
| 69 | } |
| 70 | |
| 71 | if err := c.apply(&cfg, prs, ccs...); err != nil { |
| 72 | return c.err(err) |
| 73 | } |
| 74 | cfg.AutoLeave = autoLeave |
| 75 | return checkAndReturn(cfg, prs) |
| 76 | } |
| 77 | |
| 78 | // LeaveJoint transitions out of a joint configuration. It is an error to call |
| 79 | // this method if the configuration is not joint, i.e. if the outgoing majority |
| 80 | // config Voters[1] is empty. |
| 81 | // |
| 82 | // The outgoing majority config of the joint configuration will be removed, |
| 83 | // that is, the incoming config is promoted as the sole decision maker. In the |
| 84 | // notation of the Raft thesis[1] (Section 4.3), this method transitions from |
| 85 | // `C_{new,old}` into `C_new`. |
| 86 | // |
| 87 | // At the same time, any staged learners (LearnersNext) the addition of which |
| 88 | // was held back by an overlapping voter in the former outgoing config will be |
| 89 | // inserted into Learners. |
| 90 | // |
| 91 | // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf |
| 92 | func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) { |
| 93 | cfg, prs, err := c.checkAndCopy() |
| 94 | if err != nil { |
| 95 | return c.err(err) |
| 96 | } |
| 97 | if !joint(cfg) { |
| 98 | err := errors.New("can't leave a non-joint config") |
| 99 | return c.err(err) |
| 100 | } |
| 101 | if len(outgoing(cfg.Voters)) == 0 { |
| 102 | err := fmt.Errorf("configuration is not joint: %v", cfg) |
| 103 | return c.err(err) |
| 104 | } |
| 105 | for id := range cfg.LearnersNext { |
| 106 | nilAwareAdd(&cfg.Learners, id) |
| 107 | prs[id].IsLearner = true |
| 108 | } |
| 109 | cfg.LearnersNext = nil |
| 110 | |
| 111 | for id := range outgoing(cfg.Voters) { |
| 112 | _, isVoter := incoming(cfg.Voters)[id] |
| 113 | _, isLearner := cfg.Learners[id] |
| 114 | |
| 115 | if !isVoter && !isLearner { |
| 116 | delete(prs, id) |
| 117 | } |
| 118 | } |
| 119 | *outgoingPtr(&cfg.Voters) = nil |
| 120 | cfg.AutoLeave = false |
| 121 | |
| 122 | return checkAndReturn(cfg, prs) |
| 123 | } |
| 124 | |
| 125 | // Simple carries out a series of configuration changes that (in aggregate) |
| 126 | // mutates the incoming majority config Voters[0] by at most one. This method |
| 127 | // will return an error if that is not the case, if the resulting quorum is |
| 128 | // zero, or if the configuration is in a joint state (i.e. if there is an |
| 129 | // outgoing configuration). |
| 130 | func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) { |
| 131 | cfg, prs, err := c.checkAndCopy() |
| 132 | if err != nil { |
| 133 | return c.err(err) |
| 134 | } |
| 135 | if joint(cfg) { |
| 136 | err := errors.New("can't apply simple config change in joint config") |
| 137 | return c.err(err) |
| 138 | } |
| 139 | if err := c.apply(&cfg, prs, ccs...); err != nil { |
| 140 | return c.err(err) |
| 141 | } |
| 142 | if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 { |
| 143 | return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config") |
| 144 | } |
| 145 | if err := checkInvariants(cfg, prs); err != nil { |
| 146 | return tracker.Config{}, tracker.ProgressMap{}, nil |
| 147 | } |
| 148 | |
| 149 | return checkAndReturn(cfg, prs) |
| 150 | } |
| 151 | |
| 152 | // apply a change to the configuration. By convention, changes to voters are |
| 153 | // always made to the incoming majority config Voters[0]. Voters[1] is either |
| 154 | // empty or preserves the outgoing majority configuration while in a joint state. |
| 155 | func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error { |
| 156 | for _, cc := range ccs { |
| 157 | if cc.NodeID == 0 { |
| 158 | // etcd replaces the NodeID with zero if it decides (downstream of |
| 159 | // raft) to not apply a change, so we have to have explicit code |
| 160 | // here to ignore these. |
| 161 | continue |
| 162 | } |
| 163 | switch cc.Type { |
| 164 | case pb.ConfChangeAddNode: |
| 165 | c.makeVoter(cfg, prs, cc.NodeID) |
| 166 | case pb.ConfChangeAddLearnerNode: |
| 167 | c.makeLearner(cfg, prs, cc.NodeID) |
| 168 | case pb.ConfChangeRemoveNode: |
| 169 | c.remove(cfg, prs, cc.NodeID) |
| 170 | case pb.ConfChangeUpdateNode: |
| 171 | default: |
| 172 | return fmt.Errorf("unexpected conf type %d", cc.Type) |
| 173 | } |
| 174 | } |
| 175 | if len(incoming(cfg.Voters)) == 0 { |
| 176 | return errors.New("removed all voters") |
| 177 | } |
| 178 | return nil |
| 179 | } |
| 180 | |
| 181 | // makeVoter adds or promotes the given ID to be a voter in the incoming |
| 182 | // majority config. |
| 183 | func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) { |
| 184 | pr := prs[id] |
| 185 | if pr == nil { |
| 186 | c.initProgress(cfg, prs, id, false /* isLearner */) |
| 187 | return |
| 188 | } |
| 189 | |
| 190 | pr.IsLearner = false |
| 191 | nilAwareDelete(&cfg.Learners, id) |
| 192 | nilAwareDelete(&cfg.LearnersNext, id) |
| 193 | incoming(cfg.Voters)[id] = struct{}{} |
| 194 | return |
| 195 | } |
| 196 | |
| 197 | // makeLearner makes the given ID a learner or stages it to be a learner once |
| 198 | // an active joint configuration is exited. |
| 199 | // |
| 200 | // The former happens when the peer is not a part of the outgoing config, in |
| 201 | // which case we either add a new learner or demote a voter in the incoming |
| 202 | // config. |
| 203 | // |
| 204 | // The latter case occurs when the configuration is joint and the peer is a |
| 205 | // voter in the outgoing config. In that case, we do not want to add the peer |
| 206 | // as a learner because then we'd have to track a peer as a voter and learner |
| 207 | // simultaneously. Instead, we add the learner to LearnersNext, so that it will |
| 208 | // be added to Learners the moment the outgoing config is removed by |
| 209 | // LeaveJoint(). |
| 210 | func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) { |
| 211 | pr := prs[id] |
| 212 | if pr == nil { |
| 213 | c.initProgress(cfg, prs, id, true /* isLearner */) |
| 214 | return |
| 215 | } |
| 216 | if pr.IsLearner { |
| 217 | return |
| 218 | } |
| 219 | // Remove any existing voter in the incoming config... |
| 220 | c.remove(cfg, prs, id) |
| 221 | // ... but save the Progress. |
| 222 | prs[id] = pr |
| 223 | // Use LearnersNext if we can't add the learner to Learners directly, i.e. |
| 224 | // if the peer is still tracked as a voter in the outgoing config. It will |
| 225 | // be turned into a learner in LeaveJoint(). |
| 226 | // |
| 227 | // Otherwise, add a regular learner right away. |
| 228 | if _, onRight := outgoing(cfg.Voters)[id]; onRight { |
| 229 | nilAwareAdd(&cfg.LearnersNext, id) |
| 230 | } else { |
| 231 | pr.IsLearner = true |
| 232 | nilAwareAdd(&cfg.Learners, id) |
| 233 | } |
| 234 | } |
| 235 | |
| 236 | // remove this peer as a voter or learner from the incoming config. |
| 237 | func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) { |
| 238 | if _, ok := prs[id]; !ok { |
| 239 | return |
| 240 | } |
| 241 | |
| 242 | delete(incoming(cfg.Voters), id) |
| 243 | nilAwareDelete(&cfg.Learners, id) |
| 244 | nilAwareDelete(&cfg.LearnersNext, id) |
| 245 | |
| 246 | // If the peer is still a voter in the outgoing config, keep the Progress. |
| 247 | if _, onRight := outgoing(cfg.Voters)[id]; !onRight { |
| 248 | delete(prs, id) |
| 249 | } |
| 250 | } |
| 251 | |
| 252 | // initProgress initializes a new progress for the given node or learner. |
| 253 | func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) { |
| 254 | if !isLearner { |
| 255 | incoming(cfg.Voters)[id] = struct{}{} |
| 256 | } else { |
| 257 | nilAwareAdd(&cfg.Learners, id) |
| 258 | } |
| 259 | prs[id] = &tracker.Progress{ |
| 260 | // Initializing the Progress with the last index means that the follower |
| 261 | // can be probed (with the last index). |
| 262 | // |
| 263 | // TODO(tbg): seems awfully optimistic. Using the first index would be |
| 264 | // better. The general expectation here is that the follower has no log |
| 265 | // at all (and will thus likely need a snapshot), though the app may |
| 266 | // have applied a snapshot out of band before adding the replica (thus |
| 267 | // making the first index the better choice). |
| 268 | Next: c.LastIndex, |
| 269 | Match: 0, |
| 270 | Inflights: tracker.NewInflights(c.Tracker.MaxInflight), |
| 271 | IsLearner: isLearner, |
| 272 | // When a node is first added, we should mark it as recently active. |
| 273 | // Otherwise, CheckQuorum may cause us to step down if it is invoked |
| 274 | // before the added node has had a chance to communicate with us. |
| 275 | RecentActive: true, |
| 276 | } |
| 277 | } |
| 278 | |
| 279 | // checkInvariants makes sure that the config and progress are compatible with |
| 280 | // each other. This is used to check both what the Changer is initialized with, |
| 281 | // as well as what it returns. |
| 282 | func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error { |
| 283 | // NB: intentionally allow the empty config. In production we'll never see a |
| 284 | // non-empty config (we prevent it from being created) but we will need to |
| 285 | // be able to *create* an initial config, for example during bootstrap (or |
| 286 | // during tests). Instead of having to hand-code this, we allow |
| 287 | // transitioning from an empty config into any other legal and non-empty |
| 288 | // config. |
| 289 | for _, ids := range []map[uint64]struct{}{ |
| 290 | cfg.Voters.IDs(), |
| 291 | cfg.Learners, |
| 292 | cfg.LearnersNext, |
| 293 | } { |
| 294 | for id := range ids { |
| 295 | if _, ok := prs[id]; !ok { |
| 296 | return fmt.Errorf("no progress for %d", id) |
| 297 | } |
| 298 | } |
| 299 | } |
| 300 | |
| 301 | // Any staged learner was staged because it could not be directly added due |
| 302 | // to a conflicting voter in the outgoing config. |
| 303 | for id := range cfg.LearnersNext { |
| 304 | if _, ok := outgoing(cfg.Voters)[id]; !ok { |
| 305 | return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id) |
| 306 | } |
| 307 | if prs[id].IsLearner { |
| 308 | return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id) |
| 309 | } |
| 310 | } |
| 311 | // Conversely Learners and Voters doesn't intersect at all. |
| 312 | for id := range cfg.Learners { |
| 313 | if _, ok := outgoing(cfg.Voters)[id]; ok { |
| 314 | return fmt.Errorf("%d is in Learners and Voters[1]", id) |
| 315 | } |
| 316 | if _, ok := incoming(cfg.Voters)[id]; ok { |
| 317 | return fmt.Errorf("%d is in Learners and Voters[0]", id) |
| 318 | } |
| 319 | if !prs[id].IsLearner { |
| 320 | return fmt.Errorf("%d is in Learners, but is not marked as learner", id) |
| 321 | } |
| 322 | } |
| 323 | |
| 324 | if !joint(cfg) { |
| 325 | // We enforce that empty maps are nil instead of zero. |
| 326 | if outgoing(cfg.Voters) != nil { |
| 327 | return fmt.Errorf("Voters[1] must be nil when not joint") |
| 328 | } |
| 329 | if cfg.LearnersNext != nil { |
| 330 | return fmt.Errorf("LearnersNext must be nil when not joint") |
| 331 | } |
| 332 | if cfg.AutoLeave { |
| 333 | return fmt.Errorf("AutoLeave must be false when not joint") |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | return nil |
| 338 | } |
| 339 | |
| 340 | // checkAndCopy copies the tracker's config and progress map (deeply enough for |
| 341 | // the purposes of the Changer) and returns those copies. It returns an error |
| 342 | // if checkInvariants does. |
| 343 | func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) { |
| 344 | cfg := c.Tracker.Config.Clone() |
| 345 | prs := tracker.ProgressMap{} |
| 346 | |
| 347 | for id, pr := range c.Tracker.Progress { |
| 348 | // A shallow copy is enough because we only mutate the Learner field. |
| 349 | ppr := *pr |
| 350 | prs[id] = &ppr |
| 351 | } |
| 352 | return checkAndReturn(cfg, prs) |
| 353 | } |
| 354 | |
| 355 | // checkAndReturn calls checkInvariants on the input and returns either the |
| 356 | // resulting error or the input. |
| 357 | func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) { |
| 358 | if err := checkInvariants(cfg, prs); err != nil { |
| 359 | return tracker.Config{}, tracker.ProgressMap{}, err |
| 360 | } |
| 361 | return cfg, prs, nil |
| 362 | } |
| 363 | |
| 364 | // err returns zero values and an error. |
| 365 | func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) { |
| 366 | return tracker.Config{}, nil, err |
| 367 | } |
| 368 | |
| 369 | // nilAwareAdd populates a map entry, creating the map if necessary. |
| 370 | func nilAwareAdd(m *map[uint64]struct{}, id uint64) { |
| 371 | if *m == nil { |
| 372 | *m = map[uint64]struct{}{} |
| 373 | } |
| 374 | (*m)[id] = struct{}{} |
| 375 | } |
| 376 | |
| 377 | // nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after. |
| 378 | func nilAwareDelete(m *map[uint64]struct{}, id uint64) { |
| 379 | if *m == nil { |
| 380 | return |
| 381 | } |
| 382 | delete(*m, id) |
| 383 | if len(*m) == 0 { |
| 384 | *m = nil |
| 385 | } |
| 386 | } |
| 387 | |
| 388 | // symdiff returns the count of the symmetric difference between the sets of |
| 389 | // uint64s, i.e. len( (l - r) \union (r - l)). |
| 390 | func symdiff(l, r map[uint64]struct{}) int { |
| 391 | var n int |
| 392 | pairs := [][2]quorum.MajorityConfig{ |
| 393 | {l, r}, // count elems in l but not in r |
| 394 | {r, l}, // count elems in r but not in l |
| 395 | } |
| 396 | for _, p := range pairs { |
| 397 | for id := range p[0] { |
| 398 | if _, ok := p[1][id]; !ok { |
| 399 | n++ |
| 400 | } |
| 401 | } |
| 402 | } |
| 403 | return n |
| 404 | } |
| 405 | |
| 406 | func joint(cfg tracker.Config) bool { |
| 407 | return len(outgoing(cfg.Voters)) > 0 |
| 408 | } |
| 409 | |
| 410 | func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] } |
| 411 | func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] } |
| 412 | func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] } |
| 413 | |
| 414 | // Describe prints the type and NodeID of the configuration changes as a |
| 415 | // space-delimited string. |
| 416 | func Describe(ccs ...pb.ConfChangeSingle) string { |
| 417 | var buf strings.Builder |
| 418 | for _, cc := range ccs { |
| 419 | if buf.Len() > 0 { |
| 420 | buf.WriteByte(' ') |
| 421 | } |
| 422 | fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID) |
| 423 | } |
| 424 | return buf.String() |
| 425 | } |