| Abhay Kumar | a61c522 | 2025-11-10 07:32:50 +0000 | [diff] [blame^] | 1 | // Copyright 2019 The etcd Authors |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | package confchange |
| 16 | |
| 17 | import ( |
| 18 | "errors" |
| 19 | "fmt" |
| 20 | "strings" |
| 21 | |
| 22 | "go.etcd.io/raft/v3/quorum" |
| 23 | pb "go.etcd.io/raft/v3/raftpb" |
| 24 | "go.etcd.io/raft/v3/tracker" |
| 25 | ) |
| 26 | |
| 27 | // Changer facilitates configuration changes. It exposes methods to handle |
| 28 | // simple and joint consensus while performing the proper validation that allows |
| 29 | // refusing invalid configuration changes before they affect the active |
| 30 | // configuration. |
| 31 | type Changer struct { |
| 32 | Tracker tracker.ProgressTracker |
| 33 | LastIndex uint64 |
| 34 | } |
| 35 | |
| 36 | // EnterJoint verifies that the outgoing (=right) majority config of the joint |
| 37 | // config is empty and initializes it with a copy of the incoming (=left) |
| 38 | // majority config. That is, it transitions from |
| 39 | // |
| 40 | // (1 2 3)&&() |
| 41 | // |
| 42 | // to |
| 43 | // |
| 44 | // (1 2 3)&&(1 2 3). |
| 45 | // |
| 46 | // The supplied changes are then applied to the incoming majority config, |
| 47 | // resulting in a joint configuration that in terms of the Raft thesis[1] |
| 48 | // (Section 4.3) corresponds to `C_{new,old}`. |
| 49 | // |
| 50 | // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf |
| 51 | func (c Changer) EnterJoint(autoLeave bool, ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) { |
| 52 | cfg, trk, err := c.checkAndCopy() |
| 53 | if err != nil { |
| 54 | return c.err(err) |
| 55 | } |
| 56 | if joint(cfg) { |
| 57 | err := errors.New("config is already joint") |
| 58 | return c.err(err) |
| 59 | } |
| 60 | if len(incoming(cfg.Voters)) == 0 { |
| 61 | // We allow adding nodes to an empty config for convenience (testing and |
| 62 | // bootstrap), but you can't enter a joint state. |
| 63 | err := errors.New("can't make a zero-voter config joint") |
| 64 | return c.err(err) |
| 65 | } |
| 66 | // Clear the outgoing config. |
| 67 | *outgoingPtr(&cfg.Voters) = quorum.MajorityConfig{} |
| 68 | // Copy incoming to outgoing. |
| 69 | for id := range incoming(cfg.Voters) { |
| 70 | outgoing(cfg.Voters)[id] = struct{}{} |
| 71 | } |
| 72 | |
| 73 | if err := c.apply(&cfg, trk, ccs...); err != nil { |
| 74 | return c.err(err) |
| 75 | } |
| 76 | cfg.AutoLeave = autoLeave |
| 77 | return checkAndReturn(cfg, trk) |
| 78 | } |
| 79 | |
| 80 | // LeaveJoint transitions out of a joint configuration. It is an error to call |
| 81 | // this method if the configuration is not joint, i.e. if the outgoing majority |
| 82 | // config Voters[1] is empty. |
| 83 | // |
| 84 | // The outgoing majority config of the joint configuration will be removed, |
| 85 | // that is, the incoming config is promoted as the sole decision maker. In the |
| 86 | // notation of the Raft thesis[1] (Section 4.3), this method transitions from |
| 87 | // `C_{new,old}` into `C_new`. |
| 88 | // |
| 89 | // At the same time, any staged learners (LearnersNext) the addition of which |
| 90 | // was held back by an overlapping voter in the former outgoing config will be |
| 91 | // inserted into Learners. |
| 92 | // |
| 93 | // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf |
| 94 | func (c Changer) LeaveJoint() (tracker.Config, tracker.ProgressMap, error) { |
| 95 | cfg, trk, err := c.checkAndCopy() |
| 96 | if err != nil { |
| 97 | return c.err(err) |
| 98 | } |
| 99 | if !joint(cfg) { |
| 100 | err := errors.New("can't leave a non-joint config") |
| 101 | return c.err(err) |
| 102 | } |
| 103 | for id := range cfg.LearnersNext { |
| 104 | nilAwareAdd(&cfg.Learners, id) |
| 105 | trk[id].IsLearner = true |
| 106 | } |
| 107 | cfg.LearnersNext = nil |
| 108 | |
| 109 | for id := range outgoing(cfg.Voters) { |
| 110 | _, isVoter := incoming(cfg.Voters)[id] |
| 111 | _, isLearner := cfg.Learners[id] |
| 112 | |
| 113 | if !isVoter && !isLearner { |
| 114 | delete(trk, id) |
| 115 | } |
| 116 | } |
| 117 | *outgoingPtr(&cfg.Voters) = nil |
| 118 | cfg.AutoLeave = false |
| 119 | |
| 120 | return checkAndReturn(cfg, trk) |
| 121 | } |
| 122 | |
| 123 | // Simple carries out a series of configuration changes that (in aggregate) |
| 124 | // mutates the incoming majority config Voters[0] by at most one. This method |
| 125 | // will return an error if that is not the case, if the resulting quorum is |
| 126 | // zero, or if the configuration is in a joint state (i.e. if there is an |
| 127 | // outgoing configuration). |
| 128 | func (c Changer) Simple(ccs ...pb.ConfChangeSingle) (tracker.Config, tracker.ProgressMap, error) { |
| 129 | cfg, trk, err := c.checkAndCopy() |
| 130 | if err != nil { |
| 131 | return c.err(err) |
| 132 | } |
| 133 | if joint(cfg) { |
| 134 | err := errors.New("can't apply simple config change in joint config") |
| 135 | return c.err(err) |
| 136 | } |
| 137 | if err := c.apply(&cfg, trk, ccs...); err != nil { |
| 138 | return c.err(err) |
| 139 | } |
| 140 | if n := symdiff(incoming(c.Tracker.Voters), incoming(cfg.Voters)); n > 1 { |
| 141 | return tracker.Config{}, nil, errors.New("more than one voter changed without entering joint config") |
| 142 | } |
| 143 | |
| 144 | return checkAndReturn(cfg, trk) |
| 145 | } |
| 146 | |
| 147 | // apply a change to the configuration. By convention, changes to voters are |
| 148 | // always made to the incoming majority config Voters[0]. Voters[1] is either |
| 149 | // empty or preserves the outgoing majority configuration while in a joint state. |
| 150 | func (c Changer) apply(cfg *tracker.Config, trk tracker.ProgressMap, ccs ...pb.ConfChangeSingle) error { |
| 151 | for _, cc := range ccs { |
| 152 | if cc.NodeID == 0 { |
| 153 | // etcd replaces the NodeID with zero if it decides (downstream of |
| 154 | // raft) to not apply a change, so we have to have explicit code |
| 155 | // here to ignore these. |
| 156 | continue |
| 157 | } |
| 158 | switch cc.Type { |
| 159 | case pb.ConfChangeAddNode: |
| 160 | c.makeVoter(cfg, trk, cc.NodeID) |
| 161 | case pb.ConfChangeAddLearnerNode: |
| 162 | c.makeLearner(cfg, trk, cc.NodeID) |
| 163 | case pb.ConfChangeRemoveNode: |
| 164 | c.remove(cfg, trk, cc.NodeID) |
| 165 | case pb.ConfChangeUpdateNode: |
| 166 | default: |
| 167 | return fmt.Errorf("unexpected conf type %d", cc.Type) |
| 168 | } |
| 169 | } |
| 170 | if len(incoming(cfg.Voters)) == 0 { |
| 171 | return errors.New("removed all voters") |
| 172 | } |
| 173 | return nil |
| 174 | } |
| 175 | |
| 176 | // makeVoter adds or promotes the given ID to be a voter in the incoming |
| 177 | // majority config. |
| 178 | func (c Changer) makeVoter(cfg *tracker.Config, trk tracker.ProgressMap, id uint64) { |
| 179 | pr := trk[id] |
| 180 | if pr == nil { |
| 181 | c.initProgress(cfg, trk, id, false /* isLearner */) |
| 182 | return |
| 183 | } |
| 184 | |
| 185 | pr.IsLearner = false |
| 186 | nilAwareDelete(&cfg.Learners, id) |
| 187 | nilAwareDelete(&cfg.LearnersNext, id) |
| 188 | incoming(cfg.Voters)[id] = struct{}{} |
| 189 | } |
| 190 | |
| 191 | // makeLearner makes the given ID a learner or stages it to be a learner once |
| 192 | // an active joint configuration is exited. |
| 193 | // |
| 194 | // The former happens when the peer is not a part of the outgoing config, in |
| 195 | // which case we either add a new learner or demote a voter in the incoming |
| 196 | // config. |
| 197 | // |
| 198 | // The latter case occurs when the configuration is joint and the peer is a |
| 199 | // voter in the outgoing config. In that case, we do not want to add the peer |
| 200 | // as a learner because then we'd have to track a peer as a voter and learner |
| 201 | // simultaneously. Instead, we add the learner to LearnersNext, so that it will |
| 202 | // be added to Learners the moment the outgoing config is removed by |
| 203 | // LeaveJoint(). |
| 204 | func (c Changer) makeLearner(cfg *tracker.Config, trk tracker.ProgressMap, id uint64) { |
| 205 | pr := trk[id] |
| 206 | if pr == nil { |
| 207 | c.initProgress(cfg, trk, id, true /* isLearner */) |
| 208 | return |
| 209 | } |
| 210 | if pr.IsLearner { |
| 211 | return |
| 212 | } |
| 213 | // Remove any existing voter in the incoming config... |
| 214 | c.remove(cfg, trk, id) |
| 215 | // ... but save the Progress. |
| 216 | trk[id] = pr |
| 217 | // Use LearnersNext if we can't add the learner to Learners directly, i.e. |
| 218 | // if the peer is still tracked as a voter in the outgoing config. It will |
| 219 | // be turned into a learner in LeaveJoint(). |
| 220 | // |
| 221 | // Otherwise, add a regular learner right away. |
| 222 | if _, onRight := outgoing(cfg.Voters)[id]; onRight { |
| 223 | nilAwareAdd(&cfg.LearnersNext, id) |
| 224 | } else { |
| 225 | pr.IsLearner = true |
| 226 | nilAwareAdd(&cfg.Learners, id) |
| 227 | } |
| 228 | } |
| 229 | |
| 230 | // remove this peer as a voter or learner from the incoming config. |
| 231 | func (c Changer) remove(cfg *tracker.Config, trk tracker.ProgressMap, id uint64) { |
| 232 | if _, ok := trk[id]; !ok { |
| 233 | return |
| 234 | } |
| 235 | |
| 236 | delete(incoming(cfg.Voters), id) |
| 237 | nilAwareDelete(&cfg.Learners, id) |
| 238 | nilAwareDelete(&cfg.LearnersNext, id) |
| 239 | |
| 240 | // If the peer is still a voter in the outgoing config, keep the Progress. |
| 241 | if _, onRight := outgoing(cfg.Voters)[id]; !onRight { |
| 242 | delete(trk, id) |
| 243 | } |
| 244 | } |
| 245 | |
| 246 | // initProgress initializes a new progress for the given node or learner. |
| 247 | func (c Changer) initProgress(cfg *tracker.Config, trk tracker.ProgressMap, id uint64, isLearner bool) { |
| 248 | if !isLearner { |
| 249 | incoming(cfg.Voters)[id] = struct{}{} |
| 250 | } else { |
| 251 | nilAwareAdd(&cfg.Learners, id) |
| 252 | } |
| 253 | trk[id] = &tracker.Progress{ |
| 254 | // Initializing the Progress with the last index means that the follower |
| 255 | // can be probed (with the last index). |
| 256 | // |
| 257 | // TODO(tbg): seems awfully optimistic. Using the first index would be |
| 258 | // better. The general expectation here is that the follower has no log |
| 259 | // at all (and will thus likely need a snapshot), though the app may |
| 260 | // have applied a snapshot out of band before adding the replica (thus |
| 261 | // making the first index the better choice). |
| 262 | Match: 0, |
| 263 | Next: max(c.LastIndex, 1), // invariant: Match < Next |
| 264 | Inflights: tracker.NewInflights(c.Tracker.MaxInflight, c.Tracker.MaxInflightBytes), |
| 265 | IsLearner: isLearner, |
| 266 | // When a node is first added, we should mark it as recently active. |
| 267 | // Otherwise, CheckQuorum may cause us to step down if it is invoked |
| 268 | // before the added node has had a chance to communicate with us. |
| 269 | RecentActive: true, |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | // checkInvariants makes sure that the config and progress are compatible with |
| 274 | // each other. This is used to check both what the Changer is initialized with, |
| 275 | // as well as what it returns. |
| 276 | func checkInvariants(cfg tracker.Config, trk tracker.ProgressMap) error { |
| 277 | // NB: intentionally allow the empty config. In production we'll never see a |
| 278 | // non-empty config (we prevent it from being created) but we will need to |
| 279 | // be able to *create* an initial config, for example during bootstrap (or |
| 280 | // during tests). Instead of having to hand-code this, we allow |
| 281 | // transitioning from an empty config into any other legal and non-empty |
| 282 | // config. |
| 283 | for _, ids := range []map[uint64]struct{}{ |
| 284 | cfg.Voters.IDs(), |
| 285 | cfg.Learners, |
| 286 | cfg.LearnersNext, |
| 287 | } { |
| 288 | for id := range ids { |
| 289 | if _, ok := trk[id]; !ok { |
| 290 | return fmt.Errorf("no progress for %d", id) |
| 291 | } |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | // Any staged learner was staged because it could not be directly added due |
| 296 | // to a conflicting voter in the outgoing config. |
| 297 | for id := range cfg.LearnersNext { |
| 298 | if _, ok := outgoing(cfg.Voters)[id]; !ok { |
| 299 | return fmt.Errorf("%d is in LearnersNext, but not Voters[1]", id) |
| 300 | } |
| 301 | if trk[id].IsLearner { |
| 302 | return fmt.Errorf("%d is in LearnersNext, but is already marked as learner", id) |
| 303 | } |
| 304 | } |
| 305 | // Conversely Learners and Voters doesn't intersect at all. |
| 306 | for id := range cfg.Learners { |
| 307 | if _, ok := outgoing(cfg.Voters)[id]; ok { |
| 308 | return fmt.Errorf("%d is in Learners and Voters[1]", id) |
| 309 | } |
| 310 | if _, ok := incoming(cfg.Voters)[id]; ok { |
| 311 | return fmt.Errorf("%d is in Learners and Voters[0]", id) |
| 312 | } |
| 313 | if !trk[id].IsLearner { |
| 314 | return fmt.Errorf("%d is in Learners, but is not marked as learner", id) |
| 315 | } |
| 316 | } |
| 317 | |
| 318 | if !joint(cfg) { |
| 319 | // We enforce that empty maps are nil instead of zero. |
| 320 | if outgoing(cfg.Voters) != nil { |
| 321 | return fmt.Errorf("cfg.Voters[1] must be nil when not joint") |
| 322 | } |
| 323 | if cfg.LearnersNext != nil { |
| 324 | return fmt.Errorf("cfg.LearnersNext must be nil when not joint") |
| 325 | } |
| 326 | if cfg.AutoLeave { |
| 327 | return fmt.Errorf("AutoLeave must be false when not joint") |
| 328 | } |
| 329 | } |
| 330 | |
| 331 | return nil |
| 332 | } |
| 333 | |
| 334 | // checkAndCopy copies the tracker's config and progress map (deeply enough for |
| 335 | // the purposes of the Changer) and returns those copies. It returns an error |
| 336 | // if checkInvariants does. |
| 337 | func (c Changer) checkAndCopy() (tracker.Config, tracker.ProgressMap, error) { |
| 338 | cfg := c.Tracker.Config.Clone() |
| 339 | trk := tracker.ProgressMap{} |
| 340 | |
| 341 | for id, pr := range c.Tracker.Progress { |
| 342 | // A shallow copy is enough because we only mutate the Learner field. |
| 343 | ppr := *pr |
| 344 | trk[id] = &ppr |
| 345 | } |
| 346 | return checkAndReturn(cfg, trk) |
| 347 | } |
| 348 | |
| 349 | // checkAndReturn calls checkInvariants on the input and returns either the |
| 350 | // resulting error or the input. |
| 351 | func checkAndReturn(cfg tracker.Config, trk tracker.ProgressMap) (tracker.Config, tracker.ProgressMap, error) { |
| 352 | if err := checkInvariants(cfg, trk); err != nil { |
| 353 | return tracker.Config{}, tracker.ProgressMap{}, err |
| 354 | } |
| 355 | return cfg, trk, nil |
| 356 | } |
| 357 | |
| 358 | // err returns zero values and an error. |
| 359 | func (c Changer) err(err error) (tracker.Config, tracker.ProgressMap, error) { |
| 360 | return tracker.Config{}, nil, err |
| 361 | } |
| 362 | |
| 363 | // nilAwareAdd populates a map entry, creating the map if necessary. |
| 364 | func nilAwareAdd(m *map[uint64]struct{}, id uint64) { |
| 365 | if *m == nil { |
| 366 | *m = map[uint64]struct{}{} |
| 367 | } |
| 368 | (*m)[id] = struct{}{} |
| 369 | } |
| 370 | |
| 371 | // nilAwareDelete deletes from a map, nil'ing the map itself if it is empty after. |
| 372 | func nilAwareDelete(m *map[uint64]struct{}, id uint64) { |
| 373 | if *m == nil { |
| 374 | return |
| 375 | } |
| 376 | delete(*m, id) |
| 377 | if len(*m) == 0 { |
| 378 | *m = nil |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | // symdiff returns the count of the symmetric difference between the sets of |
| 383 | // uint64s, i.e. len( (l - r) \union (r - l)). |
| 384 | func symdiff(l, r map[uint64]struct{}) int { |
| 385 | var n int |
| 386 | pairs := [][2]quorum.MajorityConfig{ |
| 387 | {l, r}, // count elems in l but not in r |
| 388 | {r, l}, // count elems in r but not in l |
| 389 | } |
| 390 | for _, p := range pairs { |
| 391 | for id := range p[0] { |
| 392 | if _, ok := p[1][id]; !ok { |
| 393 | n++ |
| 394 | } |
| 395 | } |
| 396 | } |
| 397 | return n |
| 398 | } |
| 399 | |
| 400 | func joint(cfg tracker.Config) bool { |
| 401 | return len(outgoing(cfg.Voters)) > 0 |
| 402 | } |
| 403 | |
| 404 | func incoming(voters quorum.JointConfig) quorum.MajorityConfig { return voters[0] } |
| 405 | func outgoing(voters quorum.JointConfig) quorum.MajorityConfig { return voters[1] } |
| 406 | func outgoingPtr(voters *quorum.JointConfig) *quorum.MajorityConfig { return &voters[1] } |
| 407 | |
| 408 | // Describe prints the type and NodeID of the configuration changes as a |
| 409 | // space-delimited string. |
| 410 | func Describe(ccs ...pb.ConfChangeSingle) string { |
| 411 | var buf strings.Builder |
| 412 | for _, cc := range ccs { |
| 413 | if buf.Len() > 0 { |
| 414 | buf.WriteByte(' ') |
| 415 | } |
| 416 | fmt.Fprintf(&buf, "%s(%d)", cc.Type, cc.NodeID) |
| 417 | } |
| 418 | return buf.String() |
| 419 | } |