blob: a0dc486df4f6fbe109a2fb05548803b85b4e5493 [file] [log] [blame]
Prince Pereirac1c21d62021-04-22 08:38:15 +00001// Copyright 2019 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package confchange
16
17import (
18 "errors"
19 "fmt"
20 "strings"
21
22 "go.etcd.io/etcd/raft/quorum"
23 pb "go.etcd.io/etcd/raft/raftpb"
24 "go.etcd.io/etcd/raft/tracker"
25)
26
27// Changer facilitates configuration changes. It exposes methods to handle
28// simple and joint consensus while performing the proper validation that allows
29// refusing invalid configuration changes before they affect the active
30// configuration.
31type Changer struct {
32 Tracker tracker.ProgressTracker
33 LastIndex uint64
34}
35
36// EnterJoint verifies that the outgoing (=right) majority config of the joint
37// config is empty and initializes it with a copy of the incoming (=left)
38// majority config. That is, it transitions from
39//
40// (1 2 3)&&()
41// to
42// (1 2 3)&&(1 2 3).
43//
44// The supplied changes are then applied to the incoming majority config,
45// resulting in a joint configuration that in terms of the Raft thesis[1]
46// (Section 4.3) corresponds to `C_{new,old}`.
47//
48// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
49func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
50 cfg, prs, err := c.checkAndCopy()
51 if err != nil {
52 return c.err(err)
53 }
54 if joint(cfg) {
55 err := errors.New("config is already joint")
56 return c.err(err)
57 }
58 if len(incoming(cfg.Voters)) == 0 {
59 // We allow adding nodes to an empty config for convenience (testing and
60 // bootstrap), but you can't enter a joint state.
61 err := errors.New("can't make a zero-voter config joint")
62 return c.err(err)
63 }
64 // Clear the outgoing config.
65 *outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{}
66 // Copy incoming to outgoing.
67 for id := range incoming(cfg.Voters) {
68 outgoing(cfg.Voters)[id] = struct{}{}
69 }
70
71 if err := c.apply(&cfg, prs, ccs...); err != nil {
72 return c.err(err)
73 }
74 cfg.AutoLeave = autoLeave
75 return checkAndReturn(cfg, prs)
76}
77
78// LeaveJoint transitions out of a joint configuration. It is an error to call
79// this method if the configuration is not joint, i.e. if the outgoing majority
80// config Voters[1] is empty.
81//
82// The outgoing majority config of the joint configuration will be removed,
83// that is, the incoming config is promoted as the sole decision maker. In the
84// notation of the Raft thesis[1] (Section 4.3), this method transitions from
85// `C_{new,old}` into `C_new`.
86//
87// At the same time, any staged learners (LearnersNext) the addition of which
88// was held back by an overlapping voter in the former outgoing config will be
89// inserted into Learners.
90//
91// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
92func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) {
93 cfg, prs, err := c.checkAndCopy()
94 if err != nil {
95 return c.err(err)
96 }
97 if !joint(cfg) {
98 err := errors.New("can't leave a non-joint config")
99 return c.err(err)
100 }
101 if len(outgoing(cfg.Voters)) == 0 {
102 err := fmt.Errorf("configuration is not joint: %v", cfg)
103 return c.err(err)
104 }
105 for id := range cfg.LearnersNext {
106 nilAwareAdd(&cfg.Learners, id)
107 prs[id].IsLearner = true
108 }
109 cfg.LearnersNext = nil
110
111 for id := range outgoing(cfg.Voters) {
112 _, isVoter := incoming(cfg.Voters)[id]
113 _, isLearner := cfg.Learners[id]
114
115 if !isVoter && !isLearner {
116 delete(prs, id)
117 }
118 }
119 *outgoingPtr(&cfg.Voters) = nil
120 cfg.AutoLeave = false
121
122 return checkAndReturn(cfg, prs)
123}
124
125// Simple carries out a series of configuration changes that (in aggregate)
126// mutates the incoming majority config Voters[0] by at most one. This method
127// will return an error if that is not the case, if the resulting quorum is
128// zero, or if the configuration is in a joint state (i.e. if there is an
129// outgoing configuration).
130func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
131 cfg, prs, err := c.checkAndCopy()
132 if err != nil {
133 return c.err(err)
134 }
135 if joint(cfg) {
136 err := errors.New("can't apply simple config change in joint config")
137 return c.err(err)
138 }
139 if err := c.apply(&cfg, prs, ccs...); err != nil {
140 return c.err(err)
141 }
142 if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 {
143 return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config")
144 }
145 if err := checkInvariants(cfg, prs); err != nil {
146 return tracker.Config{}, tracker.ProgressMap{}, nil
147 }
148
149 return checkAndReturn(cfg, prs)
150}
151
152// apply a change to the configuration. By convention, changes to voters are
153// always made to the incoming majority config Voters[0]. Voters[1] is either
154// empty or preserves the outgoing majority configuration while in a joint state.
155func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error {
156 for _, cc := range ccs {
157 if cc.NodeID == 0 {
158 // etcd replaces the NodeID with zero if it decides (downstream of
159 // raft) to not apply a change, so we have to have explicit code
160 // here to ignore these.
161 continue
162 }
163 switch cc.Type {
164 case pb.ConfChangeAddNode:
165 c.makeVoter(cfg, prs, cc.NodeID)
166 case pb.ConfChangeAddLearnerNode:
167 c.makeLearner(cfg, prs, cc.NodeID)
168 case pb.ConfChangeRemoveNode:
169 c.remove(cfg, prs, cc.NodeID)
170 case pb.ConfChangeUpdateNode:
171 default:
172 return fmt.Errorf("unexpected conf type %d", cc.Type)
173 }
174 }
175 if len(incoming(cfg.Voters)) == 0 {
176 return errors.New("removed all voters")
177 }
178 return nil
179}
180
181// makeVoter adds or promotes the given ID to be a voter in the incoming
182// majority config.
183func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
184 pr := prs[id]
185 if pr == nil {
186 c.initProgress(cfg, prs, id, false /* isLearner */)
187 return
188 }
189
190 pr.IsLearner = false
191 nilAwareDelete(&cfg.Learners, id)
192 nilAwareDelete(&cfg.LearnersNext, id)
193 incoming(cfg.Voters)[id] = struct{}{}
194 return
195}
196
197// makeLearner makes the given ID a learner or stages it to be a learner once
198// an active joint configuration is exited.
199//
200// The former happens when the peer is not a part of the outgoing config, in
201// which case we either add a new learner or demote a voter in the incoming
202// config.
203//
204// The latter case occurs when the configuration is joint and the peer is a
205// voter in the outgoing config. In that case, we do not want to add the peer
206// as a learner because then we'd have to track a peer as a voter and learner
207// simultaneously. Instead, we add the learner to LearnersNext, so that it will
208// be added to Learners the moment the outgoing config is removed by
209// LeaveJoint().
210func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
211 pr := prs[id]
212 if pr == nil {
213 c.initProgress(cfg, prs, id, true /* isLearner */)
214 return
215 }
216 if pr.IsLearner {
217 return
218 }
219 // Remove any existing voter in the incoming config...
220 c.remove(cfg, prs, id)
221 // ... but save the Progress.
222 prs[id] = pr
223 // Use LearnersNext if we can't add the learner to Learners directly, i.e.
224 // if the peer is still tracked as a voter in the outgoing config. It will
225 // be turned into a learner in LeaveJoint().
226 //
227 // Otherwise, add a regular learner right away.
228 if _, onRight := outgoing(cfg.Voters)[id]; onRight {
229 nilAwareAdd(&cfg.LearnersNext, id)
230 } else {
231 pr.IsLearner = true
232 nilAwareAdd(&cfg.Learners, id)
233 }
234}
235
236// remove this peer as a voter or learner from the incoming config.
237func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
238 if _, ok := prs[id]; !ok {
239 return
240 }
241
242 delete(incoming(cfg.Voters), id)
243 nilAwareDelete(&cfg.Learners, id)
244 nilAwareDelete(&cfg.LearnersNext, id)
245
246 // If the peer is still a voter in the outgoing config, keep the Progress.
247 if _, onRight := outgoing(cfg.Voters)[id]; !onRight {
248 delete(prs, id)
249 }
250}
251
252// initProgress initializes a new progress for the given node or learner.
253func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) {
254 if !isLearner {
255 incoming(cfg.Voters)[id] = struct{}{}
256 } else {
257 nilAwareAdd(&cfg.Learners, id)
258 }
259 prs[id] = &tracker.Progress{
260 // Initializing the Progress with the last index means that the follower
261 // can be probed (with the last index).
262 //
263 // TODO(tbg): seems awfully optimistic. Using the first index would be
264 // better. The general expectation here is that the follower has no log
265 // at all (and will thus likely need a snapshot), though the app may
266 // have applied a snapshot out of band before adding the replica (thus
267 // making the first index the better choice).
268 Next: c.LastIndex,
269 Match: 0,
270 Inflights: tracker.NewInflights(c.Tracker.MaxInflight),
271 IsLearner: isLearner,
272 // When a node is first added, we should mark it as recently active.
273 // Otherwise, CheckQuorum may cause us to step down if it is invoked
274 // before the added node has had a chance to communicate with us.
275 RecentActive: true,
276 }
277}
278
279// checkInvariants makes sure that the config and progress are compatible with
280// each other. This is used to check both what the Changer is initialized with,
281// as well as what it returns.
282func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error {
283 // NB: intentionally allow the empty config. In production we'll never see a
284 // non-empty config (we prevent it from being created) but we will need to
285 // be able to *create* an initial config, for example during bootstrap (or
286 // during tests). Instead of having to hand-code this, we allow
287 // transitioning from an empty config into any other legal and non-empty
288 // config.
289 for _, ids := range []map[uint64]struct{}{
290 cfg.Voters.IDs(),
291 cfg.Learners,
292 cfg.LearnersNext,
293 } {
294 for id := range ids {
295 if _, ok := prs[id]; !ok {
296 return fmt.Errorf("no progress for %d", id)
297 }
298 }
299 }
300
301 // Any staged learner was staged because it could not be directly added due
302 // to a conflicting voter in the outgoing config.
303 for id := range cfg.LearnersNext {
304 if _, ok := outgoing(cfg.Voters)[id]; !ok {
305 return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id)
306 }
307 if prs[id].IsLearner {
308 return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id)
309 }
310 }
311 // Conversely Learners and Voters doesn't intersect at all.
312 for id := range cfg.Learners {
313 if _, ok := outgoing(cfg.Voters)[id]; ok {
314 return fmt.Errorf("%d is in Learners and Voters[1]", id)
315 }
316 if _, ok := incoming(cfg.Voters)[id]; ok {
317 return fmt.Errorf("%d is in Learners and Voters[0]", id)
318 }
319 if !prs[id].IsLearner {
320 return fmt.Errorf("%d is in Learners, but is not marked as learner", id)
321 }
322 }
323
324 if !joint(cfg) {
325 // We enforce that empty maps are nil instead of zero.
326 if outgoing(cfg.Voters) != nil {
327 return fmt.Errorf("Voters[1] must be nil when not joint")
328 }
329 if cfg.LearnersNext != nil {
330 return fmt.Errorf("LearnersNext must be nil when not joint")
331 }
332 if cfg.AutoLeave {
333 return fmt.Errorf("AutoLeave must be false when not joint")
334 }
335 }
336
337 return nil
338}
339
340// checkAndCopy copies the tracker's config and progress map (deeply enough for
341// the purposes of the Changer) and returns those copies. It returns an error
342// if checkInvariants does.
343func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) {
344 cfg := c.Tracker.Config.Clone()
345 prs := tracker.ProgressMap{}
346
347 for id, pr := range c.Tracker.Progress {
348 // A shallow copy is enough because we only mutate the Learner field.
349 ppr := *pr
350 prs[id] = &ppr
351 }
352 return checkAndReturn(cfg, prs)
353}
354
355// checkAndReturn calls checkInvariants on the input and returns either the
356// resulting error or the input.
357func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) {
358 if err := checkInvariants(cfg, prs); err != nil {
359 return tracker.Config{}, tracker.ProgressMap{}, err
360 }
361 return cfg, prs, nil
362}
363
364// err returns zero values and an error.
365func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) {
366 return tracker.Config{}, nil, err
367}
368
369// nilAwareAdd populates a map entry, creating the map if necessary.
370func nilAwareAdd(m *map[uint64]struct{}, id uint64) {
371 if *m == nil {
372 *m = map[uint64]struct{}{}
373 }
374 (*m)[id] = struct{}{}
375}
376
377// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
378func nilAwareDelete(m *map[uint64]struct{}, id uint64) {
379 if *m == nil {
380 return
381 }
382 delete(*m, id)
383 if len(*m) == 0 {
384 *m = nil
385 }
386}
387
388// symdiff returns the count of the symmetric difference between the sets of
389// uint64s, i.e. len( (l - r) \union (r - l)).
390func symdiff(l, r map[uint64]struct{}) int {
391 var n int
392 pairs := [][2]quorum.MajorityConfig{
393 {l, r}, // count elems in l but not in r
394 {r, l}, // count elems in r but not in l
395 }
396 for _, p := range pairs {
397 for id := range p[0] {
398 if _, ok := p[1][id]; !ok {
399 n++
400 }
401 }
402 }
403 return n
404}
405
406func joint(cfg tracker.Config) bool {
407 return len(outgoing(cfg.Voters)) > 0
408}
409
410func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] }
411func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] }
412func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] }
413
414// Describe prints the type and NodeID of the configuration changes as a
415// space-delimited string.
416func Describe(ccs ...pb.ConfChangeSingle) string {
417 var buf strings.Builder
418 for _, cc := range ccs {
419 if buf.Len() > 0 {
420 buf.WriteByte(' ')
421 }
422 fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID)
423 }
424 return buf.String()
425}