// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
pb "go.etcd.io/raft/v3/raftpb"
"go.etcd.io/raft/v3/tracker"
)
// ErrStepLocalMsg is returned when trying to step a local raft message.
var ErrStepLocalMsg = errors.New("raft: cannot step raft local message")
// ErrStepPeerNotFound is returned when trying to step a response message
// but there is no peer found in raft.trk for that node.
var ErrStepPeerNotFound = errors.New("raft: cannot step as peer not found")
// RawNode is a thread-unsafe Node.
// The methods of this struct correspond to the methods of Node and are described
// more fully there.
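//
// A typical (hedged) usage sketch follows; saveToStorage, sendMessages, and
// applyCommitted are hypothetical application-side helpers, not part of this
// package:
//
//	storage := NewMemoryStorage()
//	rn, err := NewRawNode(&Config{
//		ID: 1, ElectionTick: 10, HeartbeatTick: 1,
//		Storage: storage, MaxSizePerMsg: 1 << 20, MaxInflightMsgs: 256,
//	})
//	if err != nil {
//		panic(err)
//	}
//	for {
//		rn.Tick() // driven by the application's ticker
//		if rn.HasReady() {
//			rd := rn.Ready()
//			saveToStorage(rd.HardState, rd.Entries, rd.Snapshot) // persist first
//			sendMessages(rd.Messages)                            // then send
//			applyCommitted(rd.CommittedEntries)                  // then apply
//			rn.Advance(rd)
//		}
//	}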
type RawNode struct {
raft *raft
asyncStorageWrites bool
// Mutable fields.
prevSoftSt *SoftState
prevHardSt pb.HardState
stepsOnAdvance []pb.Message
}
// NewRawNode instantiates a RawNode from the given configuration.
//
// See Bootstrap() for bootstrapping an initial state; this replaces the former
// 'peers' argument to this method (with identical behavior). However, it is
// recommended that instead of calling Bootstrap, applications bootstrap their
// state manually by setting up a Storage that has a first index > 1 and which
// stores the desired ConfState as its InitialState.
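//
// A minimal sketch of that manual bootstrap, assuming a single-voter
// membership (adapt the ConfState to the real cluster):
//
//	storage := NewMemoryStorage()
//	// ApplySnapshot leaves the storage with first index 2 (> 1) and makes
//	// the ConfState visible through InitialState.
//	if err := storage.ApplySnapshot(pb.Snapshot{Metadata: pb.SnapshotMetadata{
//		Index:     1,
//		Term:      1,
//		ConfState: pb.ConfState{Voters: []uint64{1}},
//	}}); err != nil {
//		panic(err)
//	}
//	rn, err := NewRawNode(&Config{ID: 1, ElectionTick: 10, HeartbeatTick: 1,
//		Storage: storage, MaxSizePerMsg: 1 << 20, MaxInflightMsgs: 256})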
func NewRawNode(config *Config) (*RawNode, error) {
r := newRaft(config)
rn := &RawNode{
raft: r,
}
rn.asyncStorageWrites = config.AsyncStorageWrites
ss := r.softState()
rn.prevSoftSt = &ss
rn.prevHardSt = r.hardState()
return rn, nil
}
// Tick advances the internal logical clock by a single tick.
func (rn *RawNode) Tick() {
rn.raft.tick()
}
// TickQuiesced advances the internal logical clock by a single tick without
// performing any other state machine processing. It allows the caller to avoid
// periodic heartbeats and elections when all of the peers in a Raft group are
// known to be at the same state. Expected usage is to periodically invoke Tick
// or TickQuiesced depending on whether the group is "active" or "quiesced".
//
// WARNING: Be very careful about using this method as it subverts the Raft
// state machine. You should probably be using Tick instead.
//
// DEPRECATED: This method will be removed in a future release.
func (rn *RawNode) TickQuiesced() {
rn.raft.electionElapsed++
}
// Campaign causes this RawNode to transition to candidate state.
func (rn *RawNode) Campaign() error {
return rn.raft.Step(pb.Message{
Type: pb.MsgHup,
})
}
// Propose proposes data be appended to the raft log.
func (rn *RawNode) Propose(data []byte) error {
return rn.raft.Step(pb.Message{
Type: pb.MsgProp,
From: rn.raft.id,
Entries: []pb.Entry{
{Data: data},
}})
}
// ProposeConfChange proposes a config change. See (Node).ProposeConfChange for
// details.
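//
// For example, to propose adding node 2 as a voter (a sketch):
//
//	cc := pb.ConfChange{Type: pb.ConfChangeAddNode, NodeID: 2}
//	if err := rn.ProposeConfChange(cc); err != nil {
//		// e.g. drop or retry the proposal
//	}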
func (rn *RawNode) ProposeConfChange(cc pb.ConfChangeI) error {
m, err := confChangeToMsg(cc)
if err != nil {
return err
}
return rn.raft.Step(m)
}
// ApplyConfChange applies a config change to the local node. The app must call
// this when it applies a configuration change, except when it decides to reject
// the configuration change, in which case no call must take place.
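//
// A hedged sketch of the expected call site while applying committed entries;
// ent is an EntryConfChange entry taken from Ready.CommittedEntries:
//
//	var cc pb.ConfChange
//	if err := cc.Unmarshal(ent.Data); err != nil {
//		panic(err)
//	}
//	cs := rn.ApplyConfChange(cc)
//	// Persist cs (e.g. alongside the applied index) for use on restart.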
func (rn *RawNode) ApplyConfChange(cc pb.ConfChangeI) *pb.ConfState {
cs := rn.raft.applyConfChange(cc.AsV2())
return &cs
}
// Step advances the state machine using the given message.
func (rn *RawNode) Step(m pb.Message) error {
// Ignore unexpected local messages received over the network.
if IsLocalMsg(m.Type) && !IsLocalMsgTarget(m.From) {
return ErrStepLocalMsg
}
if IsResponseMsg(m.Type) && !IsLocalMsgTarget(m.From) && rn.raft.trk.Progress[m.From] == nil {
return ErrStepPeerNotFound
}
return rn.raft.Step(m)
}
// Ready returns the outstanding work that the application needs to handle. This
// includes appending and applying entries or a snapshot, updating the HardState,
// and sending messages. The returned Ready() *must* be handled and subsequently
// passed back via Advance().
func (rn *RawNode) Ready() Ready {
rd := rn.readyWithoutAccept()
rn.acceptReady(rd)
return rd
}
// readyWithoutAccept returns a Ready. This is a read-only operation, i.e. there
// is no obligation that the Ready must be handled.
func (rn *RawNode) readyWithoutAccept() Ready {
r := rn.raft
rd := Ready{
Entries: r.raftLog.nextUnstableEnts(),
CommittedEntries: r.raftLog.nextCommittedEnts(rn.applyUnstableEntries()),
Messages: r.msgs,
}
if softSt := r.softState(); !softSt.equal(rn.prevSoftSt) {
// Allocate only when SoftState changes.
escapingSoftSt := softSt
rd.SoftState = &escapingSoftSt
}
if hardSt := r.hardState(); !isHardStateEqual(hardSt, rn.prevHardSt) {
rd.HardState = hardSt
}
if r.raftLog.hasNextUnstableSnapshot() {
rd.Snapshot = *r.raftLog.nextUnstableSnapshot()
}
if len(r.readStates) != 0 {
rd.ReadStates = r.readStates
}
rd.MustSync = MustSync(r.hardState(), rn.prevHardSt, len(rd.Entries))
if rn.asyncStorageWrites {
// If async storage writes are enabled, enqueue messages to
// local storage threads, where applicable.
if needStorageAppendMsg(r, rd) {
m := newStorageAppendMsg(r, rd)
rd.Messages = append(rd.Messages, m)
}
if needStorageApplyMsg(rd) {
m := newStorageApplyMsg(r, rd)
rd.Messages = append(rd.Messages, m)
}
} else {
// If async storage writes are disabled, immediately enqueue
// msgsAfterAppend to be sent out. The Ready struct contract
// mandates that Messages cannot be sent until after Entries
// are written to stable storage.
for _, m := range r.msgsAfterAppend {
if m.To != r.id {
rd.Messages = append(rd.Messages, m)
}
}
}
return rd
}
// MustSync returns true if the hard state and count of Raft entries indicate
// that a synchronous write to persistent storage is required.
func MustSync(st, prevst pb.HardState, entsnum int) bool {
// Persistent state on all servers:
// (Updated on stable storage before responding to RPCs)
// currentTerm
// votedFor
// log entries[]
return entsnum != 0 || st.Vote != prevst.Vote || st.Term != prevst.Term
}
func needStorageAppendMsg(r *raft, rd Ready) bool {
// Return true if log entries, hard state, or a snapshot need to be written
// to stable storage. Also return true if any messages are contingent on all
// prior MsgStorageAppend being processed.
return len(rd.Entries) > 0 ||
!IsEmptyHardState(rd.HardState) ||
!IsEmptySnap(rd.Snapshot) ||
len(r.msgsAfterAppend) > 0
}
func needStorageAppendRespMsg(r *raft, rd Ready) bool {
// Return true if raft needs to hear about stabilized entries or an applied
// snapshot. See the comment in newStorageAppendRespMsg, which explains why
// we check hasNextOrInProgressUnstableEnts instead of len(rd.Entries) > 0.
return r.raftLog.hasNextOrInProgressUnstableEnts() ||
!IsEmptySnap(rd.Snapshot)
}
// newStorageAppendMsg creates the message that should be sent to the local
// append thread to instruct it to append log entries, write an updated hard
// state, and apply a snapshot. The message also carries a set of responses
// that should be delivered after the rest of the message is processed. Used
// with AsyncStorageWrites.
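//
// A hedged sketch of how an append thread might handle this message; storage,
// send, and stepLocal are hypothetical application-side hooks:
//
//	// m.Type == pb.MsgStorageAppend
//	storage.Append(m.Entries) // also persist the HardState fields and m.Snapshot
//	storage.Sync()            // make the write durable before responding
//	for _, resp := range m.Responses {
//		if resp.To == selfID {
//			stepLocal(resp) // e.g. feed back via RawNode.Step
//		} else {
//			send(resp) // ship to the peer over the network
//		}
//	}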
func newStorageAppendMsg(r *raft, rd Ready) pb.Message {
m := pb.Message{
Type: pb.MsgStorageAppend,
To: LocalAppendThread,
From: r.id,
Entries: rd.Entries,
}
if !IsEmptyHardState(rd.HardState) {
// If the Ready includes a HardState update, assign each of its fields
// to the corresponding fields in the Message. This allows clients to
// reconstruct the HardState and save it to stable storage.
//
// If the Ready does not include a HardState update, make sure to not
// assign a value to any of the fields so that a HardState reconstructed
// from them will be empty (return true from raft.IsEmptyHardState).
m.Term = rd.Term
m.Vote = rd.Vote
m.Commit = rd.Commit
}
if !IsEmptySnap(rd.Snapshot) {
snap := rd.Snapshot
m.Snapshot = &snap
}
// Attach all messages in msgsAfterAppend as responses to be delivered after
// the message is processed, along with a self-directed MsgStorageAppendResp
// to acknowledge the entry stability.
//
// NB: it is important for performance that the MsgStorageAppendResp message be
// handled after self-directed MsgAppResp messages on the leader (which will
// be contained in msgsAfterAppend). This ordering allows the MsgAppResp
// handling to use a fast-path in r.raftLog.term() before the newly appended
// entries are removed from the unstable log.
m.Responses = r.msgsAfterAppend
if needStorageAppendRespMsg(r, rd) {
m.Responses = append(m.Responses, newStorageAppendRespMsg(r, rd))
}
return m
}
// newStorageAppendRespMsg creates the message that should be returned to the node
// after the unstable log entries, hard state, and snapshot in the current Ready
// (along with those in all prior Ready structs) have been saved to stable
// storage.
func newStorageAppendRespMsg(r *raft, rd Ready) pb.Message {
m := pb.Message{
Type: pb.MsgStorageAppendResp,
To: r.id,
From: LocalAppendThread,
// Dropped after term change, see below.
Term: r.Term,
}
if r.raftLog.hasNextOrInProgressUnstableEnts() {
// If the raft log has unstable entries, attach the last index and term of the
// append to the response message. This (index, term) tuple will be handed back
// and consulted when the stability of those log entries is signaled to the
// unstable. If the (index, term) match the unstable log by the time the
// response is received (unstable.stableTo), the unstable log can be truncated.
//
// However, with just this logic, there would be an ABA problem[^1] that could
// lead to the unstable log and the stable log getting out of sync temporarily
// and leading to an inconsistent view. Consider the following example with 5
// nodes, A B C D E:
//
// 1. A is the leader.
// 2. A proposes some log entries but only B receives these entries.
// 3. B gets the Ready and the entries are appended asynchronously.
// 4. A crashes and C becomes leader after getting a vote from D and E.
// 5. C proposes some log entries and B receives these entries, overwriting the
// previous unstable log entries that are in the process of being appended.
// The entries have a larger term than the previous entries but the same
// indexes. It begins appending these new entries asynchronously.
// 6. C crashes and A restarts and becomes leader again after getting the vote
// from D and E.
// 7. B receives the entries from A which are the same as the ones from step 2,
// overwriting the previous unstable log entries that are in the process of
// being appended from step 5. The entries have the original terms and
// indexes from step 2. Recall that log entries retain their original term
// numbers when a leader replicates entries from previous terms. It begins
// appending these new entries asynchronously.
// 8. The asynchronous log appends from the first Ready complete and stableTo
// is called.
// 9. However, the log entries from the second Ready are still in the
// asynchronous append pipeline and will overwrite (in stable storage) the
// entries from the first Ready at some future point. We can't truncate the
// unstable log yet or a future read from Storage might see the entries from
// step 5 before they have been replaced by the entries from step 7.
// Instead, we must wait until we are sure that the entries are stable and
// that no in-progress appends might overwrite them before removing entries
// from the unstable log.
//
// To prevent these kinds of problems, we also attach the current term to the
// MsgStorageAppendResp (above). If the term has changed by the time the
// MsgStorageAppendResp is returned, the response is ignored and the unstable
// log is not truncated. The unstable log is only truncated when the term has
// remained unchanged from the time that the MsgStorageAppend was sent to the
// time that the MsgStorageAppendResp is received, indicating that no-one else
// is in the process of truncating the stable log.
//
// However, this replaces a correctness problem with a liveness problem. If we
// only attempted to truncate the unstable log when appending new entries but
// also occasionally dropped these responses, then quiescence of new log entries
// could lead to the unstable log never being truncated.
//
// To combat this, we attempt to truncate the log on all MsgStorageAppendResp
// messages where the unstable log is not empty, not just those associated with
// entry appends. This includes MsgStorageAppendResp messages associated with an
// updated HardState, which occur after a term change.
//
// In other words, we set Index and LogTerm in a block that looks like:
//
// if r.raftLog.hasNextOrInProgressUnstableEnts() { ... }
//
// not like:
//
// if len(rd.Entries) > 0 { ... }
//
// To do so, we attach r.raftLog.lastIndex() and r.raftLog.lastTerm(), not the
// (index, term) of the last entry in rd.Entries. If rd.Entries is not empty,
// these will be the same. However, if rd.Entries is empty, we still want to
// attest that this (index, term) is correct at the current term, in case the
// MsgStorageAppend that contained the last entry in the unstable slice carried
// an earlier term and was dropped.
//
// A MsgStorageAppend with a new term is emitted on each term change. This is
// the same condition that causes MsgStorageAppendResp messages with earlier
// terms to be ignored. As a result, we are guaranteed that, assuming a bounded
// number of term changes, there will eventually be a MsgStorageAppendResp
// message that is not ignored. This means that entries in the unstable log
// which have been appended to stable storage will eventually be truncated and
// dropped from memory.
//
// [^1]: https://en.wikipedia.org/wiki/ABA_problem
last := r.raftLog.lastEntryID()
m.Index = last.index
m.LogTerm = last.term
}
if !IsEmptySnap(rd.Snapshot) {
snap := rd.Snapshot
m.Snapshot = &snap
}
return m
}
func needStorageApplyMsg(rd Ready) bool { return len(rd.CommittedEntries) > 0 }
func needStorageApplyRespMsg(rd Ready) bool { return needStorageApplyMsg(rd) }
// newStorageApplyMsg creates the message that should be sent to the local
// apply thread to instruct it to apply committed log entries. The message
// also carries a response that should be delivered after the rest of the
// message is processed. Used with AsyncStorageWrites.
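//
// A hedged sketch of how an apply thread might handle this message;
// applyToStateMachine and stepLocal are hypothetical application-side hooks:
//
//	// m.Type == pb.MsgStorageApply
//	applyToStateMachine(m.Entries)
//	for _, resp := range m.Responses {
//		stepLocal(resp) // the MsgStorageApplyResp, addressed back to the node
//	}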
func newStorageApplyMsg(r *raft, rd Ready) pb.Message {
ents := rd.CommittedEntries
return pb.Message{
Type: pb.MsgStorageApply,
To: LocalApplyThread,
From: r.id,
Term: 0, // committed entries don't apply under a specific term
Entries: ents,
Responses: []pb.Message{
newStorageApplyRespMsg(r, ents),
},
}
}
// newStorageApplyRespMsg creates the message that should be returned to the node
// after the committed entries in the current Ready (along with those in all
// prior Ready structs) have been applied to the local state machine.
func newStorageApplyRespMsg(r *raft, ents []pb.Entry) pb.Message {
return pb.Message{
Type: pb.MsgStorageApplyResp,
To: r.id,
From: LocalApplyThread,
Term: 0, // committed entries don't apply under a specific term
Entries: ents,
}
}
// acceptReady is called when the consumer of the RawNode has decided to go
// ahead and handle a Ready. Nothing must alter the state of the RawNode between
// this call and the prior call to Ready().
func (rn *RawNode) acceptReady(rd Ready) {
if rd.SoftState != nil {
rn.prevSoftSt = rd.SoftState
}
if !IsEmptyHardState(rd.HardState) {
rn.prevHardSt = rd.HardState
}
if len(rd.ReadStates) != 0 {
rn.raft.readStates = nil
}
if !rn.asyncStorageWrites {
if len(rn.stepsOnAdvance) != 0 {
rn.raft.logger.Panicf("two accepted Ready structs without call to Advance")
}
for _, m := range rn.raft.msgsAfterAppend {
if m.To == rn.raft.id {
rn.stepsOnAdvance = append(rn.stepsOnAdvance, m)
}
}
if needStorageAppendRespMsg(rn.raft, rd) {
m := newStorageAppendRespMsg(rn.raft, rd)
rn.stepsOnAdvance = append(rn.stepsOnAdvance, m)
}
if needStorageApplyRespMsg(rd) {
m := newStorageApplyRespMsg(rn.raft, rd.CommittedEntries)
rn.stepsOnAdvance = append(rn.stepsOnAdvance, m)
}
}
rn.raft.msgs = nil
rn.raft.msgsAfterAppend = nil
rn.raft.raftLog.acceptUnstable()
if len(rd.CommittedEntries) > 0 {
ents := rd.CommittedEntries
index := ents[len(ents)-1].Index
rn.raft.raftLog.acceptApplying(index, entsSize(ents), rn.applyUnstableEntries())
}
traceReady(rn.raft)
}
// applyUnstableEntries returns whether entries are allowed to be applied once
// they are known to be committed but before they have been written locally to
// stable storage.
func (rn *RawNode) applyUnstableEntries() bool {
return !rn.asyncStorageWrites
}
// HasReady is called when the RawNode user needs to check whether any Ready is pending.
func (rn *RawNode) HasReady() bool {
// TODO(nvanbenschoten): order these cases in terms of cost and frequency.
r := rn.raft
if softSt := r.softState(); !softSt.equal(rn.prevSoftSt) {
return true
}
if hardSt := r.hardState(); !IsEmptyHardState(hardSt) && !isHardStateEqual(hardSt, rn.prevHardSt) {
return true
}
if r.raftLog.hasNextUnstableSnapshot() {
return true
}
if len(r.msgs) > 0 || len(r.msgsAfterAppend) > 0 {
return true
}
if r.raftLog.hasNextUnstableEnts() || r.raftLog.hasNextCommittedEnts(rn.applyUnstableEntries()) {
return true
}
if len(r.readStates) != 0 {
return true
}
return false
}
// Advance notifies the RawNode that the application has applied and saved progress in the
// last Ready results.
//
// NOTE: Advance must not be called when using AsyncStorageWrites. Response messages from
// the local append and apply threads take its place.
func (rn *RawNode) Advance(_ Ready) {
// The actions performed by this function are encoded into stepsOnAdvance in
// acceptReady. In earlier versions of this library, they were computed from
// the provided Ready struct. Retain the unused parameter for compatibility.
if rn.asyncStorageWrites {
rn.raft.logger.Panicf("Advance must not be called when using AsyncStorageWrites")
}
for i, m := range rn.stepsOnAdvance {
_ = rn.raft.Step(m)
rn.stepsOnAdvance[i] = pb.Message{}
}
rn.stepsOnAdvance = rn.stepsOnAdvance[:0]
}
// Status returns the current status of the given group. This allocates; see
// BasicStatus and WithProgress for allocation-friendlier choices.
func (rn *RawNode) Status() Status {
status := getStatus(rn.raft)
return status
}
// BasicStatus returns a BasicStatus. Notably this does not contain the
// Progress map; see WithProgress for an allocation-free way to inspect it.
func (rn *RawNode) BasicStatus() BasicStatus {
return getBasicStatus(rn.raft)
}
// ProgressType indicates the type of replica a Progress corresponds to.
type ProgressType byte
const (
// ProgressTypePeer accompanies a Progress for a regular peer replica.
ProgressTypePeer ProgressType = iota
// ProgressTypeLearner accompanies a Progress for a learner replica.
ProgressTypeLearner
)
// WithProgress is a helper to introspect the Progress for this node and its
// peers.
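//
// For example:
//
//	rn.WithProgress(func(id uint64, typ ProgressType, pr tracker.Progress) {
//		// Inspect pr.Match, pr.Next, pr.State, etc. (Inflights is cleared.)
//	})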
func (rn *RawNode) WithProgress(visitor func(id uint64, typ ProgressType, pr tracker.Progress)) {
rn.raft.trk.Visit(func(id uint64, pr *tracker.Progress) {
typ := ProgressTypePeer
if pr.IsLearner {
typ = ProgressTypeLearner
}
p := *pr
p.Inflights = nil
visitor(id, typ, p)
})
}
// ReportUnreachable reports that the given node is not reachable for the last send.
func (rn *RawNode) ReportUnreachable(id uint64) {
_ = rn.raft.Step(pb.Message{Type: pb.MsgUnreachable, From: id})
}
// ReportSnapshot reports the status of the sent snapshot.
func (rn *RawNode) ReportSnapshot(id uint64, status SnapshotStatus) {
rej := status == SnapshotFailure
_ = rn.raft.Step(pb.Message{Type: pb.MsgSnapStatus, From: id, Reject: rej})
}
// TransferLeader tries to transfer leadership to the given transferee.
func (rn *RawNode) TransferLeader(transferee uint64) {
_ = rn.raft.Step(pb.Message{Type: pb.MsgTransferLeader, From: transferee})
}
// ForgetLeader forgets a follower's current leader, changing it to None.
// See (Node).ForgetLeader for details.
func (rn *RawNode) ForgetLeader() error {
return rn.raft.Step(pb.Message{Type: pb.MsgForgetLeader})
}
// ReadIndex requests a read state. The read state will be set in a future
// Ready and carries a read index; once the application has advanced past that
// index, any linearizable read requests issued before the read request can be
// processed safely. The read state will have the same rctx attached.
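//
// A hedged sketch of serving a linearizable read with ReadIndex; ctx and
// appliedIndex are maintained by the application:
//
//	rn.ReadIndex(ctx)
//	// Later, in the Ready loop:
//	for _, rs := range rd.ReadStates {
//		if bytes.Equal(rs.RequestCtx, ctx) && appliedIndex >= rs.Index {
//			// Safe to serve the read from the local state machine.
//		}
//	}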
func (rn *RawNode) ReadIndex(rctx []byte) {
_ = rn.raft.Step(pb.Message{Type: pb.MsgReadIndex, Entries: []pb.Entry{{Data: rctx}}})
}