| // Copyright 2019 The etcd Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package confchange |
| |
| import ( |
| "errors" |
| "fmt" |
| "strings" |
| |
| "go.etcd.io/etcd/raft/quorum" |
| pb "go.etcd.io/etcd/raft/raftpb" |
| "go.etcd.io/etcd/raft/tracker" |
| ) |
| |
| // Changer facilitates configuration changes. It exposes methods to handle |
| // simple and joint consensus while performing the proper validation that allows |
| // refusing invalid configuration changes before they affect the active |
| // configuration. |
| type Changer struct { |
| Tracker tracker.ProgressTracker |
| LastIndex uint64 |
| } |
| |
| // EnterJoint verifies that the outgoing (=right) majority config of the joint |
| // config is empty and initializes it with a copy of the incoming (=left) |
| // majority config. That is, it transitions from |
| // |
| // (1 2 3)&&() |
| // to |
| // (1 2 3)&&(1 2 3). |
| // |
| // The supplied changes are then applied to the incoming majority config, |
| // resulting in a joint configuration that in terms of the Raft thesis[1] |
| // (Section 4.3) corresponds to `C_{new,old}`. |
| // |
| // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf |
| func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) { |
| cfg, prs, err := c.checkAndCopy() |
| if err != nil { |
| return c.err(err) |
| } |
| if joint(cfg) { |
| err := errors.New("config is already joint") |
| return c.err(err) |
| } |
| if len(incoming(cfg.Voters)) == 0 { |
| // We allow adding nodes to an empty config for convenience (testing and |
| // bootstrap), but you can't enter a joint state. |
| err := errors.New("can't make a zero-voter config joint") |
| return c.err(err) |
| } |
| // Clear the outgoing config. |
| *outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{} |
| // Copy incoming to outgoing. |
| for id := range incoming(cfg.Voters) { |
| outgoing(cfg.Voters)[id] = struct{}{} |
| } |
| |
| if err := c.apply(&cfg, prs, ccs...); err != nil { |
| return c.err(err) |
| } |
| cfg.AutoLeave = autoLeave |
| return checkAndReturn(cfg, prs) |
| } |
| |
| // LeaveJoint transitions out of a joint configuration. It is an error to call |
| // this method if the configuration is not joint, i.e. if the outgoing majority |
| // config Voters[1] is empty. |
| // |
| // The outgoing majority config of the joint configuration will be removed, |
| // that is, the incoming config is promoted as the sole decision maker. In the |
| // notation of the Raft thesis[1] (Section 4.3), this method transitions from |
| // `C_{new,old}` into `C_new`. |
| // |
| // At the same time, any staged learners (LearnersNext) the addition of which |
| // was held back by an overlapping voter in the former outgoing config will be |
| // inserted into Learners. |
| // |
| // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf |
| func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) { |
| cfg, prs, err := c.checkAndCopy() |
| if err != nil { |
| return c.err(err) |
| } |
| if !joint(cfg) { |
| err := errors.New("can't leave a non-joint config") |
| return c.err(err) |
| } |
| if len(outgoing(cfg.Voters)) == 0 { |
| err := fmt.Errorf("configuration is not joint: %v", cfg) |
| return c.err(err) |
| } |
| for id := range cfg.LearnersNext { |
| nilAwareAdd(&cfg.Learners, id) |
| prs[id].IsLearner = true |
| } |
| cfg.LearnersNext = nil |
| |
| for id := range outgoing(cfg.Voters) { |
| _, isVoter := incoming(cfg.Voters)[id] |
| _, isLearner := cfg.Learners[id] |
| |
| if !isVoter && !isLearner { |
| delete(prs, id) |
| } |
| } |
| *outgoingPtr(&cfg.Voters) = nil |
| cfg.AutoLeave = false |
| |
| return checkAndReturn(cfg, prs) |
| } |
| |
| // Simple carries out a series of configuration changes that (in aggregate) |
| // mutates the incoming majority config Voters[0] by at most one. This method |
| // will return an error if that is not the case, if the resulting quorum is |
| // zero, or if the configuration is in a joint state (i.e. if there is an |
| // outgoing configuration). |
| func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) { |
| cfg, prs, err := c.checkAndCopy() |
| if err != nil { |
| return c.err(err) |
| } |
| if joint(cfg) { |
| err := errors.New("can't apply simple config change in joint config") |
| return c.err(err) |
| } |
| if err := c.apply(&cfg, prs, ccs...); err != nil { |
| return c.err(err) |
| } |
| if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 { |
| return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config") |
| } |
| if err := checkInvariants(cfg, prs); err != nil { |
| return tracker.Config{}, tracker.ProgressMap{}, nil |
| } |
| |
| return checkAndReturn(cfg, prs) |
| } |
| |
| // apply a change to the configuration. By convention, changes to voters are |
| // always made to the incoming majority config Voters[0]. Voters[1] is either |
| // empty or preserves the outgoing majority configuration while in a joint state. |
| func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error { |
| for _, cc := range ccs { |
| if cc.NodeID == 0 { |
| // etcd replaces the NodeID with zero if it decides (downstream of |
| // raft) to not apply a change, so we have to have explicit code |
| // here to ignore these. |
| continue |
| } |
| switch cc.Type { |
| case pb.ConfChangeAddNode: |
| c.makeVoter(cfg, prs, cc.NodeID) |
| case pb.ConfChangeAddLearnerNode: |
| c.makeLearner(cfg, prs, cc.NodeID) |
| case pb.ConfChangeRemoveNode: |
| c.remove(cfg, prs, cc.NodeID) |
| case pb.ConfChangeUpdateNode: |
| default: |
| return fmt.Errorf("unexpected conf type %d", cc.Type) |
| } |
| } |
| if len(incoming(cfg.Voters)) == 0 { |
| return errors.New("removed all voters") |
| } |
| return nil |
| } |
| |
| // makeVoter adds or promotes the given ID to be a voter in the incoming |
| // majority config. |
| func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) { |
| pr := prs[id] |
| if pr == nil { |
| c.initProgress(cfg, prs, id, false /* isLearner */) |
| return |
| } |
| |
| pr.IsLearner = false |
| nilAwareDelete(&cfg.Learners, id) |
| nilAwareDelete(&cfg.LearnersNext, id) |
| incoming(cfg.Voters)[id] = struct{}{} |
| return |
| } |
| |
| // makeLearner makes the given ID a learner or stages it to be a learner once |
| // an active joint configuration is exited. |
| // |
| // The former happens when the peer is not a part of the outgoing config, in |
| // which case we either add a new learner or demote a voter in the incoming |
| // config. |
| // |
| // The latter case occurs when the configuration is joint and the peer is a |
| // voter in the outgoing config. In that case, we do not want to add the peer |
| // as a learner because then we'd have to track a peer as a voter and learner |
| // simultaneously. Instead, we add the learner to LearnersNext, so that it will |
| // be added to Learners the moment the outgoing config is removed by |
| // LeaveJoint(). |
| func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) { |
| pr := prs[id] |
| if pr == nil { |
| c.initProgress(cfg, prs, id, true /* isLearner */) |
| return |
| } |
| if pr.IsLearner { |
| return |
| } |
| // Remove any existing voter in the incoming config... |
| c.remove(cfg, prs, id) |
| // ... but save the Progress. |
| prs[id] = pr |
| // Use LearnersNext if we can't add the learner to Learners directly, i.e. |
| // if the peer is still tracked as a voter in the outgoing config. It will |
| // be turned into a learner in LeaveJoint(). |
| // |
| // Otherwise, add a regular learner right away. |
| if _, onRight := outgoing(cfg.Voters)[id]; onRight { |
| nilAwareAdd(&cfg.LearnersNext, id) |
| } else { |
| pr.IsLearner = true |
| nilAwareAdd(&cfg.Learners, id) |
| } |
| } |
| |
| // remove this peer as a voter or learner from the incoming config. |
| func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) { |
| if _, ok := prs[id]; !ok { |
| return |
| } |
| |
| delete(incoming(cfg.Voters), id) |
| nilAwareDelete(&cfg.Learners, id) |
| nilAwareDelete(&cfg.LearnersNext, id) |
| |
| // If the peer is still a voter in the outgoing config, keep the Progress. |
| if _, onRight := outgoing(cfg.Voters)[id]; !onRight { |
| delete(prs, id) |
| } |
| } |
| |
| // initProgress initializes a new progress for the given node or learner. |
| func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) { |
| if !isLearner { |
| incoming(cfg.Voters)[id] = struct{}{} |
| } else { |
| nilAwareAdd(&cfg.Learners, id) |
| } |
| prs[id] = &tracker.Progress{ |
| // Initializing the Progress with the last index means that the follower |
| // can be probed (with the last index). |
| // |
| // TODO(tbg): seems awfully optimistic. Using the first index would be |
| // better. The general expectation here is that the follower has no log |
| // at all (and will thus likely need a snapshot), though the app may |
| // have applied a snapshot out of band before adding the replica (thus |
| // making the first index the better choice). |
| Next: c.LastIndex, |
| Match: 0, |
| Inflights: tracker.NewInflights(c.Tracker.MaxInflight), |
| IsLearner: isLearner, |
| // When a node is first added, we should mark it as recently active. |
| // Otherwise, CheckQuorum may cause us to step down if it is invoked |
| // before the added node has had a chance to communicate with us. |
| RecentActive: true, |
| } |
| } |
| |
| // checkInvariants makes sure that the config and progress are compatible with |
| // each other. This is used to check both what the Changer is initialized with, |
| // as well as what it returns. |
| func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error { |
| // NB: intentionally allow the empty config. In production we'll never see a |
| // non-empty config (we prevent it from being created) but we will need to |
| // be able to *create* an initial config, for example during bootstrap (or |
| // during tests). Instead of having to hand-code this, we allow |
| // transitioning from an empty config into any other legal and non-empty |
| // config. |
| for _, ids := range []map[uint64]struct{}{ |
| cfg.Voters.IDs(), |
| cfg.Learners, |
| cfg.LearnersNext, |
| } { |
| for id := range ids { |
| if _, ok := prs[id]; !ok { |
| return fmt.Errorf("no progress for %d", id) |
| } |
| } |
| } |
| |
| // Any staged learner was staged because it could not be directly added due |
| // to a conflicting voter in the outgoing config. |
| for id := range cfg.LearnersNext { |
| if _, ok := outgoing(cfg.Voters)[id]; !ok { |
| return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id) |
| } |
| if prs[id].IsLearner { |
| return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id) |
| } |
| } |
| // Conversely Learners and Voters doesn't intersect at all. |
| for id := range cfg.Learners { |
| if _, ok := outgoing(cfg.Voters)[id]; ok { |
| return fmt.Errorf("%d is in Learners and Voters[1]", id) |
| } |
| if _, ok := incoming(cfg.Voters)[id]; ok { |
| return fmt.Errorf("%d is in Learners and Voters[0]", id) |
| } |
| if !prs[id].IsLearner { |
| return fmt.Errorf("%d is in Learners, but is not marked as learner", id) |
| } |
| } |
| |
| if !joint(cfg) { |
| // We enforce that empty maps are nil instead of zero. |
| if outgoing(cfg.Voters) != nil { |
| return fmt.Errorf("Voters[1] must be nil when not joint") |
| } |
| if cfg.LearnersNext != nil { |
| return fmt.Errorf("LearnersNext must be nil when not joint") |
| } |
| if cfg.AutoLeave { |
| return fmt.Errorf("AutoLeave must be false when not joint") |
| } |
| } |
| |
| return nil |
| } |
| |
| // checkAndCopy copies the tracker's config and progress map (deeply enough for |
| // the purposes of the Changer) and returns those copies. It returns an error |
| // if checkInvariants does. |
| func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) { |
| cfg := c.Tracker.Config.Clone() |
| prs := tracker.ProgressMap{} |
| |
| for id, pr := range c.Tracker.Progress { |
| // A shallow copy is enough because we only mutate the Learner field. |
| ppr := *pr |
| prs[id] = &ppr |
| } |
| return checkAndReturn(cfg, prs) |
| } |
| |
| // checkAndReturn calls checkInvariants on the input and returns either the |
| // resulting error or the input. |
| func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) { |
| if err := checkInvariants(cfg, prs); err != nil { |
| return tracker.Config{}, tracker.ProgressMap{}, err |
| } |
| return cfg, prs, nil |
| } |
| |
| // err returns zero values and an error. |
| func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) { |
| return tracker.Config{}, nil, err |
| } |
| |
| // nilAwareAdd populates a map entry, creating the map if necessary. |
| func nilAwareAdd(m *map[uint64]struct{}, id uint64) { |
| if *m == nil { |
| *m = map[uint64]struct{}{} |
| } |
| (*m)[id] = struct{}{} |
| } |
| |
| // nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after. |
| func nilAwareDelete(m *map[uint64]struct{}, id uint64) { |
| if *m == nil { |
| return |
| } |
| delete(*m, id) |
| if len(*m) == 0 { |
| *m = nil |
| } |
| } |
| |
| // symdiff returns the count of the symmetric difference between the sets of |
| // uint64s, i.e. len( (l - r) \union (r - l)). |
| func symdiff(l, r map[uint64]struct{}) int { |
| var n int |
| pairs := [][2]quorum.MajorityConfig{ |
| {l, r}, // count elems in l but not in r |
| {r, l}, // count elems in r but not in l |
| } |
| for _, p := range pairs { |
| for id := range p[0] { |
| if _, ok := p[1][id]; !ok { |
| n++ |
| } |
| } |
| } |
| return n |
| } |
| |
| func joint(cfg tracker.Config) bool { |
| return len(outgoing(cfg.Voters)) > 0 |
| } |
| |
| func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] } |
| func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] } |
| func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] } |
| |
| // Describe prints the type and NodeID of the configuration changes as a |
| // space-delimited string. |
| func Describe(ccs ...pb.ConfChangeSingle) string { |
| var buf strings.Builder |
| for _, cc := range ccs { |
| if buf.Len() > 0 { |
| buf.WriteByte(' ') |
| } |
| fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID) |
| } |
| return buf.String() |
| } |