blob: a73d5609aa450c74d7b3a5a04f9b34680ce79519 [file] [log] [blame]
Abhay Kumara61c5222025-11-10 07:32:50 +00001// Copyright 2019 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package confchange
16
17import (
18 "errors"
19 "fmt"
20 "strings"
21
22 "go.etcd.io/raft/v3/quorum"
23 pb "go.etcd.io/raft/v3/raftpb"
24 "go.etcd.io/raft/v3/tracker"
25)
26
27// Changer facilitates configuration changes. It exposes methods to handle
28// simple and joint consensus while performing the proper validation that allows
29// refusing invalid configuration changes before they affect the active
30// configuration.
31type Changer struct {
32 Tracker tracker.ProgressTracker
33 LastIndex uint64
34}
35
36// EnterJoint verifies that the outgoing (=right) majority config of the joint
37// config is empty and initializes it with a copy of the incoming (=left)
38// majority config. That is, it transitions from
39//
40// (1 2 3)&&()
41//
42// to
43//
44// (1 2 3)&&(1 2 3).
45//
46// The supplied changes are then applied to the incoming majority config,
47// resulting in a joint configuration that in terms of the Raft thesis[1]
48// (Section 4.3) corresponds to `C_{new,old}`.
49//
50// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
51func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
52 cfg, trk, err := c.checkAndCopy()
53 if err != nil {
54 return c.err(err)
55 }
56 if joint(cfg) {
57 err := errors.New("config is already joint")
58 return c.err(err)
59 }
60 if len(incoming(cfg.Voters)) == 0 {
61 // We allow adding nodes to an empty config for convenience (testing and
62 // bootstrap), but you can't enter a joint state.
63 err := errors.New("can't make a zero-voter config joint")
64 return c.err(err)
65 }
66 // Clear the outgoing config.
67 *outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{}
68 // Copy incoming to outgoing.
69 for id := range incoming(cfg.Voters) {
70 outgoing(cfg.Voters)[id] = struct{}{}
71 }
72
73 if err := c.apply(&cfg, trk, ccs...); err != nil {
74 return c.err(err)
75 }
76 cfg.AutoLeave = autoLeave
77 return checkAndReturn(cfg, trk)
78}
79
80// LeaveJoint transitions out of a joint configuration. It is an error to call
81// this method if the configuration is not joint, i.e. if the outgoing majority
82// config Voters[1] is empty.
83//
84// The outgoing majority config of the joint configuration will be removed,
85// that is, the incoming config is promoted as the sole decision maker. In the
86// notation of the Raft thesis[1] (Section 4.3), this method transitions from
87// `C_{new,old}` into `C_new`.
88//
89// At the same time, any staged learners (LearnersNext) the addition of which
90// was held back by an overlapping voter in the former outgoing config will be
91// inserted into Learners.
92//
93// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
94func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) {
95 cfg, trk, err := c.checkAndCopy()
96 if err != nil {
97 return c.err(err)
98 }
99 if !joint(cfg) {
100 err := errors.New("can't leave a non-joint config")
101 return c.err(err)
102 }
103 for id := range cfg.LearnersNext {
104 nilAwareAdd(&cfg.Learners, id)
105 trk[id].IsLearner = true
106 }
107 cfg.LearnersNext = nil
108
109 for id := range outgoing(cfg.Voters) {
110 _, isVoter := incoming(cfg.Voters)[id]
111 _, isLearner := cfg.Learners[id]
112
113 if !isVoter && !isLearner {
114 delete(trk, id)
115 }
116 }
117 *outgoingPtr(&cfg.Voters) = nil
118 cfg.AutoLeave = false
119
120 return checkAndReturn(cfg, trk)
121}
122
123// Simple carries out a series of configuration changes that (in aggregate)
124// mutates the incoming majority config Voters[0] by at most one. This method
125// will return an error if that is not the case, if the resulting quorum is
126// zero, or if the configuration is in a joint state (i.e. if there is an
127// outgoing configuration).
128func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) {
129 cfg, trk, err := c.checkAndCopy()
130 if err != nil {
131 return c.err(err)
132 }
133 if joint(cfg) {
134 err := errors.New("can't apply simple config change in joint config")
135 return c.err(err)
136 }
137 if err := c.apply(&cfg, trk, ccs...); err != nil {
138 return c.err(err)
139 }
140 if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 {
141 return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config")
142 }
143
144 return checkAndReturn(cfg, trk)
145}
146
147// apply a change to the configuration. By convention, changes to voters are
148// always made to the incoming majority config Voters[0]. Voters[1] is either
149// empty or preserves the outgoing majority configuration while in a joint state.
150func (c Changer) apply(cfg *tracker.Config, trk tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error {
151 for _, cc := range ccs {
152 if cc.NodeID == 0 {
153 // etcd replaces the NodeID with zero if it decides (downstream of
154 // raft) to not apply a change, so we have to have explicit code
155 // here to ignore these.
156 continue
157 }
158 switch cc.Type {
159 case pb.ConfChangeAddNode:
160 c.makeVoter(cfg, trk, cc.NodeID)
161 case pb.ConfChangeAddLearnerNode:
162 c.makeLearner(cfg, trk, cc.NodeID)
163 case pb.ConfChangeRemoveNode:
164 c.remove(cfg, trk, cc.NodeID)
165 case pb.ConfChangeUpdateNode:
166 default:
167 return fmt.Errorf("unexpected conf type %d", cc.Type)
168 }
169 }
170 if len(incoming(cfg.Voters)) == 0 {
171 return errors.New("removed all voters")
172 }
173 return nil
174}
175
176// makeVoter adds or promotes the given ID to be a voter in the incoming
177// majority config.
178func (c Changer) makeVoter(cfg *tracker.Config, trk tracker.ProgressMap, id uint64) {
179 pr := trk[id]
180 if pr == nil {
181 c.initProgress(cfg, trk, id, false /* isLearner */)
182 return
183 }
184
185 pr.IsLearner = false
186 nilAwareDelete(&cfg.Learners, id)
187 nilAwareDelete(&cfg.LearnersNext, id)
188 incoming(cfg.Voters)[id] = struct{}{}
189}
190
191// makeLearner makes the given ID a learner or stages it to be a learner once
192// an active joint configuration is exited.
193//
194// The former happens when the peer is not a part of the outgoing config, in
195// which case we either add a new learner or demote a voter in the incoming
196// config.
197//
198// The latter case occurs when the configuration is joint and the peer is a
199// voter in the outgoing config. In that case, we do not want to add the peer
200// as a learner because then we'd have to track a peer as a voter and learner
201// simultaneously. Instead, we add the learner to LearnersNext, so that it will
202// be added to Learners the moment the outgoing config is removed by
203// LeaveJoint().
204func (c Changer) makeLearner(cfg *tracker.Config, trk tracker.ProgressMap, id uint64) {
205 pr := trk[id]
206 if pr == nil {
207 c.initProgress(cfg, trk, id, true /* isLearner */)
208 return
209 }
210 if pr.IsLearner {
211 return
212 }
213 // Remove any existing voter in the incoming config...
214 c.remove(cfg, trk, id)
215 // ... but save the Progress.
216 trk[id] = pr
217 // Use LearnersNext if we can't add the learner to Learners directly, i.e.
218 // if the peer is still tracked as a voter in the outgoing config. It will
219 // be turned into a learner in LeaveJoint().
220 //
221 // Otherwise, add a regular learner right away.
222 if _, onRight := outgoing(cfg.Voters)[id]; onRight {
223 nilAwareAdd(&cfg.LearnersNext, id)
224 } else {
225 pr.IsLearner = true
226 nilAwareAdd(&cfg.Learners, id)
227 }
228}
229
230// remove this peer as a voter or learner from the incoming config.
231func (c Changer) remove(cfg *tracker.Config, trk tracker.ProgressMap, id uint64) {
232 if _, ok := trk[id]; !ok {
233 return
234 }
235
236 delete(incoming(cfg.Voters), id)
237 nilAwareDelete(&cfg.Learners, id)
238 nilAwareDelete(&cfg.LearnersNext, id)
239
240 // If the peer is still a voter in the outgoing config, keep the Progress.
241 if _, onRight := outgoing(cfg.Voters)[id]; !onRight {
242 delete(trk, id)
243 }
244}
245
246// initProgress initializes a new progress for the given node or learner.
247func (c Changer) initProgress(cfg *tracker.Config, trk tracker.ProgressMap, id uint64, isLearner bool) {
248 if !isLearner {
249 incoming(cfg.Voters)[id] = struct{}{}
250 } else {
251 nilAwareAdd(&cfg.Learners, id)
252 }
253 trk[id] = &tracker.Progress{
254 // Initializing the Progress with the last index means that the follower
255 // can be probed (with the last index).
256 //
257 // TODO(tbg): seems awfully optimistic. Using the first index would be
258 // better. The general expectation here is that the follower has no log
259 // at all (and will thus likely need a snapshot), though the app may
260 // have applied a snapshot out of band before adding the replica (thus
261 // making the first index the better choice).
262 Match: 0,
263 Next: max(c.LastIndex, 1), // invariant: Match < Next
264 Inflights: tracker.NewInflights(c.Tracker.MaxInflight, c.Tracker.MaxInflightBytes),
265 IsLearner: isLearner,
266 // When a node is first added, we should mark it as recently active.
267 // Otherwise, CheckQuorum may cause us to step down if it is invoked
268 // before the added node has had a chance to communicate with us.
269 RecentActive: true,
270 }
271}
272
273// checkInvariants makes sure that the config and progress are compatible with
274// each other. This is used to check both what the Changer is initialized with,
275// as well as what it returns.
276func checkInvariants(cfg tracker.Config, trk tracker.ProgressMap) error {
277 // NB: intentionally allow the empty config. In production we'll never see a
278 // non-empty config (we prevent it from being created) but we will need to
279 // be able to *create* an initial config, for example during bootstrap (or
280 // during tests). Instead of having to hand-code this, we allow
281 // transitioning from an empty config into any other legal and non-empty
282 // config.
283 for _, ids := range []map[uint64]struct{}{
284 cfg.Voters.IDs(),
285 cfg.Learners,
286 cfg.LearnersNext,
287 } {
288 for id := range ids {
289 if _, ok := trk[id]; !ok {
290 return fmt.Errorf("no progress for %d", id)
291 }
292 }
293 }
294
295 // Any staged learner was staged because it could not be directly added due
296 // to a conflicting voter in the outgoing config.
297 for id := range cfg.LearnersNext {
298 if _, ok := outgoing(cfg.Voters)[id]; !ok {
299 return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id)
300 }
301 if trk[id].IsLearner {
302 return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id)
303 }
304 }
305 // Conversely Learners and Voters doesn't intersect at all.
306 for id := range cfg.Learners {
307 if _, ok := outgoing(cfg.Voters)[id]; ok {
308 return fmt.Errorf("%d is in Learners and Voters[1]", id)
309 }
310 if _, ok := incoming(cfg.Voters)[id]; ok {
311 return fmt.Errorf("%d is in Learners and Voters[0]", id)
312 }
313 if !trk[id].IsLearner {
314 return fmt.Errorf("%d is in Learners, but is not marked as learner", id)
315 }
316 }
317
318 if !joint(cfg) {
319 // We enforce that empty maps are nil instead of zero.
320 if outgoing(cfg.Voters) != nil {
321 return fmt.Errorf("cfg.Voters[1] must be nil when not joint")
322 }
323 if cfg.LearnersNext != nil {
324 return fmt.Errorf("cfg.LearnersNext must be nil when not joint")
325 }
326 if cfg.AutoLeave {
327 return fmt.Errorf("AutoLeave must be false when not joint")
328 }
329 }
330
331 return nil
332}
333
334// checkAndCopy copies the tracker's config and progress map (deeply enough for
335// the purposes of the Changer) and returns those copies. It returns an error
336// if checkInvariants does.
337func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) {
338 cfg := c.Tracker.Config.Clone()
339 trk := tracker.ProgressMap{}
340
341 for id, pr := range c.Tracker.Progress {
342 // A shallow copy is enough because we only mutate the Learner field.
343 ppr := *pr
344 trk[id] = &ppr
345 }
346 return checkAndReturn(cfg, trk)
347}
348
349// checkAndReturn calls checkInvariants on the input and returns either the
350// resulting error or the input.
351func checkAndReturn(cfg tracker.Config, trk tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) {
352 if err := checkInvariants(cfg, trk); err != nil {
353 return tracker.Config{}, tracker.ProgressMap{}, err
354 }
355 return cfg, trk, nil
356}
357
358// err returns zero values and an error.
359func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) {
360 return tracker.Config{}, nil, err
361}
362
363// nilAwareAdd populates a map entry, creating the map if necessary.
364func nilAwareAdd(m *map[uint64]struct{}, id uint64) {
365 if *m == nil {
366 *m = map[uint64]struct{}{}
367 }
368 (*m)[id] = struct{}{}
369}
370
371// nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after.
372func nilAwareDelete(m *map[uint64]struct{}, id uint64) {
373 if *m == nil {
374 return
375 }
376 delete(*m, id)
377 if len(*m) == 0 {
378 *m = nil
379 }
380}
381
382// symdiff returns the count of the symmetric difference between the sets of
383// uint64s, i.e. len( (l - r) \union (r - l)).
384func symdiff(l, r map[uint64]struct{}) int {
385 var n int
386 pairs := [][2]quorum.MajorityConfig{
387 {l, r}, // count elems in l but not in r
388 {r, l}, // count elems in r but not in l
389 }
390 for _, p := range pairs {
391 for id := range p[0] {
392 if _, ok := p[1][id]; !ok {
393 n++
394 }
395 }
396 }
397 return n
398}
399
400func joint(cfg tracker.Config) bool {
401 return len(outgoing(cfg.Voters)) > 0
402}
403
404func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] }
405func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] }
406func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] }
407
408// Describe prints the type and NodeID of the configuration changes as a
409// space-delimited string.
410func Describe(ccs ...pb.ConfChangeSingle) string {
411 var buf strings.Builder
412 for _, cc := range ccs {
413 if buf.Len() > 0 {
414 buf.WriteByte(' ')
415 }
416 fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID)
417 }
418 return buf.String()
419}