blob: a4da2ae2e23c9e0816f10666eb29e90f87e4b0cc [file] [log] [blame]
Abhay Kumar40252eb2025-10-13 13:25:53 +00001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package raft
16
17import (
18 "errors"
19
20 pb "go.etcd.io/raft/v3/raftpb"
21 "go.etcd.io/raft/v3/tracker"
22)
23
24// ErrStepLocalMsg is returned when try to step a local raft message
25var ErrStepLocalMsg = errors.New("raft: cannot step raft local message")
26
27// ErrStepPeerNotFound is returned when try to step a response message
28// but there is no peer found in raft.trk for that node.
29var ErrStepPeerNotFound = errors.New("raft: cannot step as peer not found")
30
31// RawNode is a thread-unsafe Node.
32// The methods of this struct correspond to the methods of Node and are described
33// more fully there.
34type RawNode struct {
35 raft *raft
36 asyncStorageWrites bool
37
38 // Mutable fields.
39 prevSoftSt *SoftState
40 prevHardSt pb.HardState
41 stepsOnAdvance []pb.Message
42}
43
44// NewRawNode instantiates a RawNode from the given configuration.
45//
46// See Bootstrap() for bootstrapping an initial state; this replaces the former
47// 'peers' argument to this method (with identical behavior). However, It is
48// recommended that instead of calling Bootstrap, applications bootstrap their
49// state manually by setting up a Storage that has a first index > 1 and which
50// stores the desired ConfState as its InitialState.
51func NewRawNode(config *Config) (*RawNode, error) {
52 r := newRaft(config)
53 rn := &RawNode{
54 raft: r,
55 }
56 rn.asyncStorageWrites = config.AsyncStorageWrites
57 ss := r.softState()
58 rn.prevSoftSt = &ss
59 rn.prevHardSt = r.hardState()
60 return rn, nil
61}
62
63// Tick advances the internal logical clock by a single tick.
64func (rn *RawNode) Tick() {
65 rn.raft.tick()
66}
67
68// TickQuiesced advances the internal logical clock by a single tick without
69// performing any other state machine processing. It allows the caller to avoid
70// periodic heartbeats and elections when all of the peers in a Raft group are
71// known to be at the same state. Expected usage is to periodically invoke Tick
72// or TickQuiesced depending on whether the group is "active" or "quiesced".
73//
74// WARNING: Be very careful about using this method as it subverts the Raft
75// state machine. You should probably be using Tick instead.
76//
77// DEPRECATED: This method will be removed in a future release.
78func (rn *RawNode) TickQuiesced() {
79 rn.raft.electionElapsed++
80}
81
82// Campaign causes this RawNode to transition to candidate state.
83func (rn *RawNode) Campaign() error {
84 return rn.raft.Step(pb.Message{
85 Type: pb.MsgHup,
86 })
87}
88
89// Propose proposes data be appended to the raft log.
90func (rn *RawNode) Propose(data []byte) error {
91 return rn.raft.Step(pb.Message{
92 Type: pb.MsgProp,
93 From: rn.raft.id,
94 Entries: []pb.Entry{
95 {Data: data},
96 }})
97}
98
99// ProposeConfChange proposes a config change. See (Node).ProposeConfChange for
100// details.
101func (rn *RawNode) ProposeConfChange(cc pb.ConfChangeI) error {
102 m, err := confChangeToMsg(cc)
103 if err != nil {
104 return err
105 }
106 return rn.raft.Step(m)
107}
108
109// ApplyConfChange applies a config change to the local node. The app must call
110// this when it applies a configuration change, except when it decides to reject
111// the configuration change, in which case no call must take place.
112func (rn *RawNode) ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState {
113 cs := rn.raft.applyConfChange(cc.AsV2())
114 return &cs
115}
116
117// Step advances the state machine using the given message.
118func (rn *RawNode) Step(m pb.Message) error {
119 // Ignore unexpected local messages receiving over network.
120 if IsLocalMsg(m.Type) && !IsLocalMsgTarget(m.From) {
121 return ErrStepLocalMsg
122 }
123 if IsResponseMsg(m.Type) && !IsLocalMsgTarget(m.From) && rn.raft.trk.Progress[m.From] == nil {
124 return ErrStepPeerNotFound
125 }
126 return rn.raft.Step(m)
127}
128
129// Ready returns the outstanding work that the application needs to handle. This
130// includes appending and applying entries or a snapshot, updating the HardState,
131// and sending messages. The returned Ready() *must* be handled and subsequently
132// passed back via Advance().
133func (rn *RawNode) Ready() Ready {
134 rd := rn.readyWithoutAccept()
135 rn.acceptReady(rd)
136 return rd
137}
138
139// readyWithoutAccept returns a Ready. This is a read-only operation, i.e. there
140// is no obligation that the Ready must be handled.
141func (rn *RawNode) readyWithoutAccept() Ready {
142 r := rn.raft
143
144 rd := Ready{
145 Entries: r.raftLog.nextUnstableEnts(),
146 CommittedEntries: r.raftLog.nextCommittedEnts(rn.applyUnstableEntries()),
147 Messages: r.msgs,
148 }
149 if softSt := r.softState(); !softSt.equal(rn.prevSoftSt) {
150 // Allocate only when SoftState changes.
151 escapingSoftSt := softSt
152 rd.SoftState = &escapingSoftSt
153 }
154 if hardSt := r.hardState(); !isHardStateEqual(hardSt, rn.prevHardSt) {
155 rd.HardState = hardSt
156 }
157 if r.raftLog.hasNextUnstableSnapshot() {
158 rd.Snapshot = *r.raftLog.nextUnstableSnapshot()
159 }
160 if len(r.readStates) != 0 {
161 rd.ReadStates = r.readStates
162 }
163 rd.MustSync = MustSync(r.hardState(), rn.prevHardSt, len(rd.Entries))
164
165 if rn.asyncStorageWrites {
166 // If async storage writes are enabled, enqueue messages to
167 // local storage threads, where applicable.
168 if needStorageAppendMsg(r, rd) {
169 m := newStorageAppendMsg(r, rd)
170 rd.Messages = append(rd.Messages, m)
171 }
172 if needStorageApplyMsg(rd) {
173 m := newStorageApplyMsg(r, rd)
174 rd.Messages = append(rd.Messages, m)
175 }
176 } else {
177 // If async storage writes are disabled, immediately enqueue
178 // msgsAfterAppend to be sent out. The Ready struct contract
179 // mandates that Messages cannot be sent until after Entries
180 // are written to stable storage.
181 for _, m := range r.msgsAfterAppend {
182 if m.To != r.id {
183 rd.Messages = append(rd.Messages, m)
184 }
185 }
186 }
187
188 return rd
189}
190
191// MustSync returns true if the hard state and count of Raft entries indicate
192// that a synchronous write to persistent storage is required.
193func MustSync(st, prevst pb.HardState, entsnum int) bool {
194 // Persistent state on all servers:
195 // (Updated on stable storage before responding to RPCs)
196 // currentTerm
197 // votedFor
198 // log entries[]
199 return entsnum != 0 || st.Vote != prevst.Vote || st.Term != prevst.Term
200}
201
202func needStorageAppendMsg(r *raft, rd Ready) bool {
203 // Return true if log entries, hard state, or a snapshot need to be written
204 // to stable storage. Also return true if any messages are contingent on all
205 // prior MsgStorageAppend being processed.
206 return len(rd.Entries) > 0 ||
207 !IsEmptyHardState(rd.HardState) ||
208 !IsEmptySnap(rd.Snapshot) ||
209 len(r.msgsAfterAppend) > 0
210}
211
212func needStorageAppendRespMsg(r *raft, rd Ready) bool {
213 // Return true if raft needs to hear about stabilized entries or an applied
214 // snapshot. See the comment in newStorageAppendRespMsg, which explains why
215 // we check hasNextOrInProgressUnstableEnts instead of len(rd.Entries) > 0.
216 return r.raftLog.hasNextOrInProgressUnstableEnts() ||
217 !IsEmptySnap(rd.Snapshot)
218}
219
220// newStorageAppendMsg creates the message that should be sent to the local
221// append thread to instruct it to append log entries, write an updated hard
222// state, and apply a snapshot. The message also carries a set of responses
223// that should be delivered after the rest of the message is processed. Used
224// with AsyncStorageWrites.
225func newStorageAppendMsg(r *raft, rd Ready) pb.Message {
226 m := pb.Message{
227 Type: pb.MsgStorageAppend,
228 To: LocalAppendThread,
229 From: r.id,
230 Entries: rd.Entries,
231 }
232 if !IsEmptyHardState(rd.HardState) {
233 // If the Ready includes a HardState update, assign each of its fields
234 // to the corresponding fields in the Message. This allows clients to
235 // reconstruct the HardState and save it to stable storage.
236 //
237 // If the Ready does not include a HardState update, make sure to not
238 // assign a value to any of the fields so that a HardState reconstructed
239 // from them will be empty (return true from raft.IsEmptyHardState).
240 m.Term = rd.Term
241 m.Vote = rd.Vote
242 m.Commit = rd.Commit
243 }
244 if !IsEmptySnap(rd.Snapshot) {
245 snap := rd.Snapshot
246 m.Snapshot = &snap
247 }
248 // Attach all messages in msgsAfterAppend as responses to be delivered after
249 // the message is processed, along with a self-directed MsgStorageAppendResp
250 // to acknowledge the entry stability.
251 //
252 // NB: it is important for performance that MsgStorageAppendResp message be
253 // handled after self-directed MsgAppResp messages on the leader (which will
254 // be contained in msgsAfterAppend). This ordering allows the MsgAppResp
255 // handling to use a fast-path in r.raftLog.term() before the newly appended
256 // entries are removed from the unstable log.
257 m.Responses = r.msgsAfterAppend
258 if needStorageAppendRespMsg(r, rd) {
259 m.Responses = append(m.Responses, newStorageAppendRespMsg(r, rd))
260 }
261 return m
262}
263
264// newStorageAppendRespMsg creates the message that should be returned to node
265// after the unstable log entries, hard state, and snapshot in the current Ready
266// (along with those in all prior Ready structs) have been saved to stable
267// storage.
268func newStorageAppendRespMsg(r *raft, rd Ready) pb.Message {
269 m := pb.Message{
270 Type: pb.MsgStorageAppendResp,
271 To: r.id,
272 From: LocalAppendThread,
273 // Dropped after term change, see below.
274 Term: r.Term,
275 }
276 if r.raftLog.hasNextOrInProgressUnstableEnts() {
277 // If the raft log has unstable entries, attach the last index and term of the
278 // append to the response message. This (index, term) tuple will be handed back
279 // and consulted when the stability of those log entries is signaled to the
280 // unstable. If the (index, term) match the unstable log by the time the
281 // response is received (unstable.stableTo), the unstable log can be truncated.
282 //
283 // However, with just this logic, there would be an ABA problem[^1] that could
284 // lead to the unstable log and the stable log getting out of sync temporarily
285 // and leading to an inconsistent view. Consider the following example with 5
286 // nodes, A B C D E:
287 //
288 // 1. A is the leader.
289 // 2. A proposes some log entries but only B receives these entries.
290 // 3. B gets the Ready and the entries are appended asynchronously.
291 // 4. A crashes and C becomes leader after getting a vote from D and E.
292 // 5. C proposes some log entries and B receives these entries, overwriting the
293 // previous unstable log entries that are in the process of being appended.
294 // The entries have a larger term than the previous entries but the same
295 // indexes. It begins appending these new entries asynchronously.
296 // 6. C crashes and A restarts and becomes leader again after getting the vote
297 // from D and E.
298 // 7. B receives the entries from A which are the same as the ones from step 2,
299 // overwriting the previous unstable log entries that are in the process of
300 // being appended from step 5. The entries have the original terms and
301 // indexes from step 2. Recall that log entries retain their original term
302 // numbers when a leader replicates entries from previous terms. It begins
303 // appending these new entries asynchronously.
304 // 8. The asynchronous log appends from the first Ready complete and stableTo
305 // is called.
306 // 9. However, the log entries from the second Ready are still in the
307 // asynchronous append pipeline and will overwrite (in stable storage) the
308 // entries from the first Ready at some future point. We can't truncate the
309 // unstable log yet or a future read from Storage might see the entries from
310 // step 5 before they have been replaced by the entries from step 7.
311 // Instead, we must wait until we are sure that the entries are stable and
312 // that no in-progress appends might overwrite them before removing entries
313 // from the unstable log.
314 //
315 // To prevent these kinds of problems, we also attach the current term to the
316 // MsgStorageAppendResp (above). If the term has changed by the time the
317 // MsgStorageAppendResp if returned, the response is ignored and the unstable
318 // log is not truncated. The unstable log is only truncated when the term has
319 // remained unchanged from the time that the MsgStorageAppend was sent to the
320 // time that the MsgStorageAppendResp is received, indicating that no-one else
321 // is in the process of truncating the stable log.
322 //
323 // However, this replaces a correctness problem with a liveness problem. If we
324 // only attempted to truncate the unstable log when appending new entries but
325 // also occasionally dropped these responses, then quiescence of new log entries
326 // could lead to the unstable log never being truncated.
327 //
328 // To combat this, we attempt to truncate the log on all MsgStorageAppendResp
329 // messages where the unstable log is not empty, not just those associated with
330 // entry appends. This includes MsgStorageAppendResp messages associated with an
331 // updated HardState, which occur after a term change.
332 //
333 // In other words, we set Index and LogTerm in a block that looks like:
334 //
335 // if r.raftLog.hasNextOrInProgressUnstableEnts() { ... }
336 //
337 // not like:
338 //
339 // if len(rd.Entries) > 0 { ... }
340 //
341 // To do so, we attach r.raftLog.lastIndex() and r.raftLog.lastTerm(), not the
342 // (index, term) of the last entry in rd.Entries. If rd.Entries is not empty,
343 // these will be the same. However, if rd.Entries is empty, we still want to
344 // attest that this (index, term) is correct at the current term, in case the
345 // MsgStorageAppend that contained the last entry in the unstable slice carried
346 // an earlier term and was dropped.
347 //
348 // A MsgStorageAppend with a new term is emitted on each term change. This is
349 // the same condition that causes MsgStorageAppendResp messages with earlier
350 // terms to be ignored. As a result, we are guaranteed that, assuming a bounded
351 // number of term changes, there will eventually be a MsgStorageAppendResp
352 // message that is not ignored. This means that entries in the unstable log
353 // which have been appended to stable storage will eventually be truncated and
354 // dropped from memory.
355 //
356 // [^1]: https://en.wikipedia.org/wiki/ABA_problem
357 last := r.raftLog.lastEntryID()
358 m.Index = last.index
359 m.LogTerm = last.term
360 }
361 if !IsEmptySnap(rd.Snapshot) {
362 snap := rd.Snapshot
363 m.Snapshot = &snap
364 }
365 return m
366}
367
368func needStorageApplyMsg(rd Ready) bool { return len(rd.CommittedEntries) > 0 }
369func needStorageApplyRespMsg(rd Ready) bool { return needStorageApplyMsg(rd) }
370
371// newStorageApplyMsg creates the message that should be sent to the local
372// apply thread to instruct it to apply committed log entries. The message
373// also carries a response that should be delivered after the rest of the
374// message is processed. Used with AsyncStorageWrites.
375func newStorageApplyMsg(r *raft, rd Ready) pb.Message {
376 ents := rd.CommittedEntries
377 return pb.Message{
378 Type: pb.MsgStorageApply,
379 To: LocalApplyThread,
380 From: r.id,
381 Term: 0, // committed entries don't apply under a specific term
382 Entries: ents,
383 Responses: []pb.Message{
384 newStorageApplyRespMsg(r, ents),
385 },
386 }
387}
388
389// newStorageApplyRespMsg creates the message that should be returned to node
390// after the committed entries in the current Ready (along with those in all
391// prior Ready structs) have been applied to the local state machine.
392func newStorageApplyRespMsg(r *raft, ents []pb.Entry) pb.Message {
393 return pb.Message{
394 Type: pb.MsgStorageApplyResp,
395 To: r.id,
396 From: LocalApplyThread,
397 Term: 0, // committed entries don't apply under a specific term
398 Entries: ents,
399 }
400}
401
402// acceptReady is called when the consumer of the RawNode has decided to go
403// ahead and handle a Ready. Nothing must alter the state of the RawNode between
404// this call and the prior call to Ready().
405func (rn *RawNode) acceptReady(rd Ready) {
406 if rd.SoftState != nil {
407 rn.prevSoftSt = rd.SoftState
408 }
409 if !IsEmptyHardState(rd.HardState) {
410 rn.prevHardSt = rd.HardState
411 }
412 if len(rd.ReadStates) != 0 {
413 rn.raft.readStates = nil
414 }
415 if !rn.asyncStorageWrites {
416 if len(rn.stepsOnAdvance) != 0 {
417 rn.raft.logger.Panicf("two accepted Ready structs without call to Advance")
418 }
419 for _, m := range rn.raft.msgsAfterAppend {
420 if m.To == rn.raft.id {
421 rn.stepsOnAdvance = append(rn.stepsOnAdvance, m)
422 }
423 }
424 if needStorageAppendRespMsg(rn.raft, rd) {
425 m := newStorageAppendRespMsg(rn.raft, rd)
426 rn.stepsOnAdvance = append(rn.stepsOnAdvance, m)
427 }
428 if needStorageApplyRespMsg(rd) {
429 m := newStorageApplyRespMsg(rn.raft, rd.CommittedEntries)
430 rn.stepsOnAdvance = append(rn.stepsOnAdvance, m)
431 }
432 }
433 rn.raft.msgs = nil
434 rn.raft.msgsAfterAppend = nil
435 rn.raft.raftLog.acceptUnstable()
436 if len(rd.CommittedEntries) > 0 {
437 ents := rd.CommittedEntries
438 index := ents[len(ents)-1].Index
439 rn.raft.raftLog.acceptApplying(index, entsSize(ents), rn.applyUnstableEntries())
440 }
441
442 traceReady(rn.raft)
443}
444
445// applyUnstableEntries returns whether entries are allowed to be applied once
446// they are known to be committed but before they have been written locally to
447// stable storage.
448func (rn *RawNode) applyUnstableEntries() bool {
449 return !rn.asyncStorageWrites
450}
451
452// HasReady called when RawNode user need to check if any Ready pending.
453func (rn *RawNode) HasReady() bool {
454 // TODO(nvanbenschoten): order these cases in terms of cost and frequency.
455 r := rn.raft
456 if softSt := r.softState(); !softSt.equal(rn.prevSoftSt) {
457 return true
458 }
459 if hardSt := r.hardState(); !IsEmptyHardState(hardSt) && !isHardStateEqual(hardSt, rn.prevHardSt) {
460 return true
461 }
462 if r.raftLog.hasNextUnstableSnapshot() {
463 return true
464 }
465 if len(r.msgs) > 0 || len(r.msgsAfterAppend) > 0 {
466 return true
467 }
468 if r.raftLog.hasNextUnstableEnts() || r.raftLog.hasNextCommittedEnts(rn.applyUnstableEntries()) {
469 return true
470 }
471 if len(r.readStates) != 0 {
472 return true
473 }
474 return false
475}
476
477// Advance notifies the RawNode that the application has applied and saved progress in the
478// last Ready results.
479//
480// NOTE: Advance must not be called when using AsyncStorageWrites. Response messages from
481// the local append and apply threads take its place.
482func (rn *RawNode) Advance(_ Ready) {
483 // The actions performed by this function are encoded into stepsOnAdvance in
484 // acceptReady. In earlier versions of this library, they were computed from
485 // the provided Ready struct. Retain the unused parameter for compatibility.
486 if rn.asyncStorageWrites {
487 rn.raft.logger.Panicf("Advance must not be called when using AsyncStorageWrites")
488 }
489 for i, m := range rn.stepsOnAdvance {
490 _ = rn.raft.Step(m)
491 rn.stepsOnAdvance[i] = pb.Message{}
492 }
493 rn.stepsOnAdvance = rn.stepsOnAdvance[:0]
494}
495
496// Status returns the current status of the given group. This allocates, see
497// BasicStatus and WithProgress for allocation-friendlier choices.
498func (rn *RawNode) Status() Status {
499 status := getStatus(rn.raft)
500 return status
501}
502
503// BasicStatus returns a BasicStatus. Notably this does not contain the
504// Progress map; see WithProgress for an allocation-free way to inspect it.
505func (rn *RawNode) BasicStatus() BasicStatus {
506 return getBasicStatus(rn.raft)
507}
508
509// ProgressType indicates the type of replica a Progress corresponds to.
510type ProgressType byte
511
512const (
513 // ProgressTypePeer accompanies a Progress for a regular peer replica.
514 ProgressTypePeer ProgressType = iota
515 // ProgressTypeLearner accompanies a Progress for a learner replica.
516 ProgressTypeLearner
517)
518
519// WithProgress is a helper to introspect the Progress for this node and its
520// peers.
521func (rn *RawNode) WithProgress(visitor func(id uint64, typ ProgressType, pr tracker.Progress)) {
522 rn.raft.trk.Visit(func(id uint64, pr *tracker.Progress) {
523 typ := ProgressTypePeer
524 if pr.IsLearner {
525 typ = ProgressTypeLearner
526 }
527 p := *pr
528 p.Inflights = nil
529 visitor(id, typ, p)
530 })
531}
532
533// ReportUnreachable reports the given node is not reachable for the last send.
534func (rn *RawNode) ReportUnreachable(id uint64) {
535 _ = rn.raft.Step(pb.Message{Type: pb.MsgUnreachable, From: id})
536}
537
538// ReportSnapshot reports the status of the sent snapshot.
539func (rn *RawNode) ReportSnapshot(id uint64, status SnapshotStatus) {
540 rej := status == SnapshotFailure
541
542 _ = rn.raft.Step(pb.Message{Type: pb.MsgSnapStatus, From: id, Reject: rej})
543}
544
545// TransferLeader tries to transfer leadership to the given transferee.
546func (rn *RawNode) TransferLeader(transferee uint64) {
547 _ = rn.raft.Step(pb.Message{Type: pb.MsgTransferLeader, From: transferee})
548}
549
550// ForgetLeader forgets a follower's current leader, changing it to None.
551// See (Node).ForgetLeader for details.
552func (rn *RawNode) ForgetLeader() error {
553 return rn.raft.Step(pb.Message{Type: pb.MsgForgetLeader})
554}
555
556// ReadIndex requests a read state. The read state will be set in ready.
557// Read State has a read index. Once the application advances further than the read
558// index, any linearizable read requests issued before the read request can be
559// processed safely. The read state will have the same rctx attached.
560func (rn *RawNode) ReadIndex(rctx []byte) {
561 _ = rn.raft.Step(pb.Message{Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: rctx}}})
562}