blob: fd75aedc8011657e64449e5a6c776c7240a2e7e8 [file] [log] [blame]
Devmalya Paulfb990a52019-07-09 10:01:49 -04001// Copyright 2019 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package confchange
16
17import (
18 "errors"
19 "fmt"
20 "strings"
21
22 "go.etcd.io/etcd/raft/quorum"
23 pb "go.etcd.io/etcd/raft/raftpb"
24 "go.etcd.io/etcd/raft/tracker"
25)
26
27// Changer facilitates configuration changes. It exposes methods to handle
28// simple and joint consensus while performing the proper validation that allows
29// refusing invalid configuration changes before they affect the active
30// configuration.
31type Changer struct {
32 Tracker tracker.ProgressTracker
33 LastIndex uint64
34}
35
36// EnterJoint verifies that the outgoing (=right) majority config of the joint
37// config is empty and initializes it with a copy of the incoming (=left)
38// majority config. That is, it transitions from
39//
40// (1 2 3)&&()
41// to
42// (1 2 3)&&(1 2 3).
43//
44// The supplied ConfChanges are then applied to the incoming majority config,
45// resulting in a joint configuration that in terms of the Raft thesis[1]
46// (Section 4.3) corresponds to `C_{new,old}`.
47//
48// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
49func (c Changer) EnterJoint(ccs ...pb.ConfChange) (tracker.Config, tracker.ProgressMap, error) {
50 cfg, prs, err := c.checkAndCopy()
51 if err != nil {
52 return c.err(err)
53 }
54 if joint(cfg) {
55 err := errors.New("config is already joint")
56 return c.err(err)
57 }
58 if len(incoming(cfg.Voters)) == 0 {
59 // We allow adding nodes to an empty config for convenience (testing and
60 // bootstrap), but you can't enter a joint state.
61 err := errors.New("can't make a zero-voter config joint")
62 return c.err(err)
63 }
64 // Clear the outgoing config.
65 {
66 *outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{}
67
68 }
69 // Copy incoming to outgoing.
70 for id := range incoming(cfg.Voters) {
71 outgoing(cfg.Voters)[id] = struct{}{}
72 }
73
74 if err := c.apply(&cfg, prs, ccs...); err != nil {
75 return c.err(err)
76 }
77
78 return checkAndReturn(cfg, prs)
79}
80
81// LeaveJoint transitions out of a joint configuration. It is an error to call
82// this method if the configuration is not joint, i.e. if the outgoing majority
83// config Voters[1] is empty.
84//
85// The outgoing majority config of the joint configuration will be removed,
86// that is, the incoming config is promoted as the sole decision maker. In the
87// notation of the Raft thesis[1] (Section 4.3), this method transitions from
88// `C_{new,old}` into `C_new`.
89//
90// At the same time, any staged learners (LearnersNext) the addition of which
91// was held back by an overlapping voter in the former outgoing config will be
92// inserted into Learners.
93//
94// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
95func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) {
96 cfg, prs, err := c.checkAndCopy()
97 if err != nil {
98 return c.err(err)
99 }
100 if !joint(cfg) {
101 err := errors.New("can't leave a non-joint config")
102 return c.err(err)
103 }
104 if len(outgoing(cfg.Voters)) == 0 {
105 err := fmt.Errorf("configuration is not joint: %v", cfg)
106 return c.err(err)
107 }
108 for id := range cfg.LearnersNext {
109 nilAwareAdd(&cfg.Learners, id)
110 prs[id].IsLearner = true
111 }
112 cfg.LearnersNext = nil
113
114 for id := range outgoing(cfg.Voters) {
115 _, isVoter := incoming(cfg.Voters)[id]
116 _, isLearner := cfg.Learners[id]
117
118 if !isVoter && !isLearner {
119 delete(prs, id)
120 }
121 }
122 *outgoingPtr(&cfg.Voters) = nil
123
124 return checkAndReturn(cfg, prs)
125}
126
127// Simple carries out a series of configuration changes that (in aggregate)
128// mutates the incoming majority config Voters[0] by at most one. This method
129// will return an error if that is not the case, if the resulting quorum is
130// zero, or if the configuration is in a joint state (i.e. if there is an
131// outgoing configuration).
132func (c Changer) Simple(ccs ...pb.ConfChange) (tracker.Config, tracker.ProgressMap, error) {
133 cfg, prs, err := c.checkAndCopy()
134 if err != nil {
135 return c.err(err)
136 }
137 if joint(cfg) {
138 err := errors.New("can't apply simple config change in joint config")
139 return c.err(err)
140 }
141 if err := c.apply(&cfg, prs, ccs...); err != nil {
142 return c.err(err)
143 }
144 if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 {
145 return tracker.Config{}, nil, errors.New("more than voter changed without entering joint config")
146 }
147 if err := checkInvariants(cfg, prs); err != nil {
148 return tracker.Config{}, tracker.ProgressMap{}, nil
149 }
150
151 return checkAndReturn(cfg, prs)
152}
153
154// apply a ConfChange to the configuration. By convention, changes to voters are
155// always made to the incoming majority config Voters[0]. Voters[1] is either
156// empty or preserves the outgoing majority configuration while in a joint state.
157func (c Changer) apply(cfg *tracker.Config, prs tracker.ProgressMap, ccs ...pb.ConfChange) error {
158 for _, cc := range ccs {
159 if cc.NodeID == 0 {
160 // etcd replaces the NodeID with zero if it decides (downstream of
161 // raft) to not apply a ConfChange, so we have to have explicit code
162 // here to ignore these.
163 continue
164 }
165 switch cc.Type {
166 case pb.ConfChangeAddNode:
167 c.makeVoter(cfg, prs, cc.NodeID)
168 case pb.ConfChangeAddLearnerNode:
169 c.makeLearner(cfg, prs, cc.NodeID)
170 case pb.ConfChangeRemoveNode:
171 c.remove(cfg, prs, cc.NodeID)
172 case pb.ConfChangeUpdateNode:
173 default:
174 return fmt.Errorf("unexpected conf type %d", cc.Type)
175 }
176 }
177 if len(incoming(cfg.Voters)) == 0 {
178 return errors.New("removed all voters")
179 }
180 return nil
181}
182
183// makeVoter adds or promotes the given ID to be a voter in the incoming
184// majority config.
185func (c Changer) makeVoter(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
186 pr := prs[id]
187 if pr == nil {
188 c.initProgress(cfg, prs, id, false /* isLearner */)
189 return
190 }
191
192 pr.IsLearner = false
193 nilAwareDelete(&cfg.Learners, id)
194 nilAwareDelete(&cfg.LearnersNext, id)
195 incoming(cfg.Voters)[id] = struct{}{}
196 return
197}
198
199// makeLearner makes the given ID a learner or stages it to be a learner once
200// an active joint configuration is exited.
201//
202// The former happens when the peer is not a part of the outgoing config, in
203// which case we either add a new learner or demote a voter in the incoming
204// config.
205//
206// The latter case occurs when the configuration is joint and the peer is a
207// voter in the outgoing config. In that case, we do not want to add the peer
208// as a learner because then we'd have to track a peer as a voter and learner
209// simultaneously. Instead, we add the learner to LearnersNext, so that it will
210// be added to Learners the moment the outgoing config is removed by
211// LeaveJoint().
212func (c Changer) makeLearner(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
213 pr := prs[id]
214 if pr == nil {
215 c.initProgress(cfg, prs, id, true /* isLearner */)
216 return
217 }
218 if pr.IsLearner {
219 return
220 }
221 // Remove any existing voter in the incoming config...
222 c.remove(cfg, prs, id)
223 // ... but save the Progress.
224 prs[id] = pr
225 // Use LearnersNext if we can't add the learner to Learners directly, i.e.
226 // if the peer is still tracked as a voter in the outgoing config. It will
227 // be turned into a learner in LeaveJoint().
228 //
229 // Otherwise, add a regular learner right away.
230 if _, onRight := outgoing(cfg.Voters)[id]; onRight {
231 nilAwareAdd(&cfg.LearnersNext, id)
232 } else {
233 pr.IsLearner = true
234 nilAwareAdd(&cfg.Learners, id)
235 }
236}
237
238// remove this peer as a voter or learner from the incoming config.
239func (c Changer) remove(cfg *tracker.Config, prs tracker.ProgressMap, id uint64) {
240 if _, ok := prs[id]; !ok {
241 return
242 }
243
244 delete(incoming(cfg.Voters), id)
245 nilAwareDelete(&cfg.Learners, id)
246 nilAwareDelete(&cfg.LearnersNext, id)
247
248 // If the peer is still a voter in the outgoing config, keep the Progress.
249 if _, onRight := outgoing(cfg.Voters)[id]; !onRight {
250 delete(prs, id)
251 }
252}
253
254// initProgress initializes a new progress for the given node or learner.
255func (c Changer) initProgress(cfg *tracker.Config, prs tracker.ProgressMap, id uint64, isLearner bool) {
256 if !isLearner {
257 incoming(cfg.Voters)[id] = struct{}{}
258 } else {
259 nilAwareAdd(&cfg.Learners, id)
260 }
261 prs[id] = &tracker.Progress{
262 // We initialize Progress.Next with lastIndex+1 so that the peer will be
263 // probed without an index first.
264 //
265 // TODO(tbg): verify that, this is just my best guess.
266 Next: c.LastIndex + 1,
267 Match: 0,
268 Inflights: tracker.NewInflights(c.Tracker.MaxInflight),
269 IsLearner: isLearner,
270 // When a node is first added, we should mark it as recently active.
271 // Otherwise, CheckQuorum may cause us to step down if it is invoked
272 // before the added node has had a chance to communicate with us.
273 RecentActive: true,
274 }
275}
276
277// checkInvariants makes sure that the config and progress are compatible with
278// each other. This is used to check both what the Changer is initialized with,
279// as well as what it returns.
280func checkInvariants(cfg tracker.Config, prs tracker.ProgressMap) error {
281 // NB: intentionally allow the empty config. In production we'll never see a
282 // non-empty config (we prevent it from being created) but we will need to
283 // be able to *create* an initial config, for example during bootstrap (or
284 // during tests). Instead of having to hand-code this, we allow
285 // transitioning from an empty config into any other legal and non-empty
286 // config.
287 for _, ids := range []map[uint64]struct{}{
288 cfg.Voters.IDs(),
289 cfg.Learners,
290 cfg.LearnersNext,
291 } {
292 for id := range ids {
293 if _, ok := prs[id]; !ok {
294 return fmt.Errorf("no progress for %d", id)
295 }
296 }
297 }
298
299 // Any staged learner was staged because it could not be directly added due
300 // to a conflicting voter in the outgoing config.
301 for id := range cfg.LearnersNext {
302 if _, ok := outgoing(cfg.Voters)[id]; !ok {
303 return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id)
304 }
305 if prs[id].IsLearner {
306 return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id)
307 }
308 }
309 // Conversely Learners and Voters doesn't intersect at all.
310 for id := range cfg.Learners {
311 if _, ok := outgoing(cfg.Voters)[id]; ok {
312 return fmt.Errorf("%d is in Learners and Voters[1]", id)
313 }
314 if _, ok := incoming(cfg.Voters)[id]; ok {
315 return fmt.Errorf("%d is in Learners and Voters[0]", id)
316 }
317 if !prs[id].IsLearner {
318 return fmt.Errorf("%d is in Learners, but is not marked as learner", id)
319 }
320 }
321
322 if !joint(cfg) {
323 // We enforce that empty maps are nil instead of zero.
324 if outgoing(cfg.Voters) != nil {
325 return fmt.Errorf("Voters[1] must be nil when not joint")
326 }
327 if cfg.LearnersNext != nil {
328 return fmt.Errorf("LearnersNext must be nil when not joint")
329 }
330 }
331
332 return nil
333}
334
335// checkAndCopy copies the tracker's config and progress map (deeply enough for
336// the purposes of the Changer) and returns those copies. It returns an error
337// if checkInvariants does.
338func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) {
339 cfg := c.Tracker.Config.Clone()
340 prs := tracker.ProgressMap{}
341
342 for id, pr := range c.Tracker.Progress {
343 // A shallow copy is enough because we only mutate the Learner field.
344 ppr := *pr
345 prs[id] = &ppr
346 }
347 return checkAndReturn(cfg, prs)
348}
349
350// checkAndReturn calls checkInvariants on the input and returns either the
351// resulting error or the input.
352func checkAndReturn(cfg tracker.Config, prs tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) {
353 if err := checkInvariants(cfg, prs); err != nil {
354 return tracker.Config{}, tracker.ProgressMap{}, err
355 }
356 return cfg, prs, nil
357}
358
359// err returns zero values and an error.
360func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) {
361 return tracker.Config{}, nil, err
362}
363
364// nilAwareAdd populates a map entry, creating the map if necessary.
365func nilAwareAdd(m *map[uint64]struct{}, id uint64) {
366 if *m == nil {
367 *m = map[uint64]struct{}{}
368 }
369 (*m)[id] = struct{}{}
370}
371
372// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
373func nilAwareDelete(m *map[uint64]struct{}, id uint64) {
374 if *m == nil {
375 return
376 }
377 delete(*m, id)
378 if len(*m) == 0 {
379 *m = nil
380 }
381}
382
383// symdiff returns the count of the symmetric difference between the sets of
384// uint64s, i.e. len( (l - r) \union (r - l)).
385func symdiff(l, r map[uint64]struct{}) int {
386 var n int
387 pairs := [][2]quorum.MajorityConfig{
388 {l, r}, // count elems in l but not in r
389 {r, l}, // count elems in r but not in l
390 }
391 for _, p := range pairs {
392 for id := range p[0] {
393 if _, ok := p[1][id]; !ok {
394 n++
395 }
396 }
397 }
398 return n
399}
400
401func joint(cfg tracker.Config) bool {
402 return len(outgoing(cfg.Voters)) > 0
403}
404
405func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] }
406func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] }
407func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] }
408
409// Describe prints the type and NodeID of the configuration changes as a
410// space-delimited string.
411func Describe(ccs ...pb.ConfChange) string {
412 var buf strings.Builder
413 for _, cc := range ccs {
414 if buf.Len() > 0 {
415 buf.WriteByte(' ')
416 }
417 fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID)
418 }
419 return buf.String()
420}