| syntax = "proto2"; |
| package raftpb; |
| |
| import "gogoproto/gogo.proto"; |
| |
| // File-wide gogoproto code-generation options. The summaries below follow the |
| // gogoproto extension documentation, which is the authoritative reference. |
| // |
| // Generate Marshal, Size and Unmarshal methods for every message. |
| option (gogoproto.marshaler_all) = true; |
| option (gogoproto.sizer_all) = true; |
| option (gogoproto.unmarshaler_all) = true; |
| // Do not generate Get* accessor methods. |
| option (gogoproto.goproto_getters_all) = false; |
| // Do not prefix generated enum value names with the enum type name. |
| option (gogoproto.goproto_enum_prefix_all) = false; |
| // Omit the XXX_NoUnkeyedLiteral, XXX_unrecognized and XXX_sizecache |
| // bookkeeping fields from the generated structs. |
| option (gogoproto.goproto_unkeyed_all) = false; |
| option (gogoproto.goproto_unrecognized_all) = false; |
| option (gogoproto.goproto_sizecache_all) = false; |
| |
| // EntryType identifies what kind of entry an Entry is (see Entry.Type). |
| enum EntryType { |
| // First declared value, and therefore the default for Entry.Type. |
| // NOTE(review): presumably an ordinary (non-configuration-change) entry — |
| // confirm. |
| EntryNormal = 0; |
| EntryConfChange = 1; // corresponds to pb.ConfChange |
| EntryConfChangeV2 = 2; // corresponds to pb.ConfChangeV2 |
| } |
| |
| // Entry is a single log entry: a term, an index, an EntryType and an optional |
| // payload. Entries are carried in Message.entries. |
| // |
| // NOTE: the fields are deliberately declared out of field-number order (Term |
| // and Index come before Type even though Type is field 1). Per the inline |
| // notes, Term and Index must stay 64-bit aligned for atomic operations, so do |
| // not reorder the declarations. Presumably declaration order determines the |
| // layout of the generated struct — confirm before touching it. |
| message Entry { |
| optional uint64 Term = 2 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations |
| optional uint64 Index = 3 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations |
| // Kind of entry; see EntryType. |
| optional EntryType Type = 1 [(gogoproto.nullable) = false]; |
| // Payload bytes; the only field here not marked (gogoproto.nullable) = false. |
| // NOTE(review): presumably interpreted according to Type — confirm. |
| optional bytes Data = 4; |
| } |
| |
| // SnapshotMetadata describes a Snapshot: a configuration state plus an |
| // index/term pair. It is embedded in Snapshot.metadata. |
| // NOTE(review): index and term presumably identify the last log entry covered |
| // by the snapshot — confirm against the raft package. |
| message SnapshotMetadata { |
| // Configuration (voters/learners) associated with the snapshot; see ConfState. |
| optional ConfState conf_state = 1 [(gogoproto.nullable) = false]; |
| optional uint64 index = 2 [(gogoproto.nullable) = false]; |
| optional uint64 term = 3 [(gogoproto.nullable) = false]; |
| } |
| |
| // Snapshot pairs snapshot payload bytes with the SnapshotMetadata that |
| // describes them. It is carried in Message.snapshot. |
| message Snapshot { |
| // Snapshot payload. |
| // NOTE(review): presumably opaque, application-defined state — confirm. |
| optional bytes data = 1; |
| // Metadata describing this snapshot; marked non-nullable. |
| optional SnapshotMetadata metadata = 2 [(gogoproto.nullable) = false]; |
| } |
| |
| // MessageType enumerates the kinds of Message (see Message.type). |
| // |
| // For description of different message types, see: |
| // https://pkg.go.dev/go.etcd.io/raft/v3#hdr-MessageType |
| enum MessageType { |
| MsgHup = 0; |
| MsgBeat = 1; |
| MsgProp = 2; |
| MsgApp = 3; |
| MsgAppResp = 4; |
| MsgVote = 5; |
| MsgVoteResp = 6; |
| MsgSnap = 7; |
| MsgHeartbeat = 8; |
| MsgHeartbeatResp = 9; |
| MsgUnreachable = 10; |
| MsgSnapStatus = 11; |
| MsgCheckQuorum = 12; |
| MsgTransferLeader = 13; |
| MsgTimeoutNow = 14; |
| MsgReadIndex = 15; |
| MsgReadIndexResp = 16; |
| MsgPreVote = 17; |
| MsgPreVoteResp = 18; |
| MsgStorageAppend = 19; |
| MsgStorageAppendResp = 20; |
| MsgStorageApply = 21; |
| MsgStorageApplyResp = 22; |
| MsgForgetLeader = 23; |
| // NOTE: when adding new message types, remember to update the isLocalMsg and |
| // isResponseMsg arrays in raft/util.go and update the corresponding tests in |
| // raft/util_test.go. |
| // |
| // The values are currently contiguous (0-23). Give a new type the next unused |
| // number instead of renumbering existing ones: the numbers, not the names, |
| // are what is encoded on the wire. |
| } |
| |
| // Message is the single envelope used for every MessageType. Which fields are |
| // meaningful depends on type; see the per-field comments below. |
| // |
| // NOTE: declaration order does not follow field-number order (vote = 13 is |
| // declared between commit = 8 and snapshot = 9). Field numbers 1-14 are all in |
| // use. |
| message Message { |
| // Kind of message; determines how the remaining fields are interpreted. |
| optional MessageType type = 1 [(gogoproto.nullable) = false]; |
| // NOTE(review): to and from are presumably the IDs of the receiving and |
| // sending nodes respectively — confirm against the raft package. |
| optional uint64 to = 2 [(gogoproto.nullable) = false]; |
| optional uint64 from = 3 [(gogoproto.nullable) = false]; |
| // Term the message relates to (e.g. the term being voted in; see vote below). |
| optional uint64 term = 4 [(gogoproto.nullable) = false]; |
| // logTerm is generally used for appending Raft logs to followers. For example, |
| // (type=MsgApp,index=100,logTerm=5) means the leader appends entries starting |
| // at index=101, and the term of the entry at index 100 is 5. |
| // (type=MsgAppResp,reject=true,index=100,logTerm=5) means follower rejects some |
| // entries from its leader as it already has an entry with term 5 at index 100. |
| // (type=MsgStorageAppendResp,index=100,logTerm=5) means the local node wrote |
| // entries up to index=100 in stable storage, and the term of the entry at index |
| // 100 was 5. This doesn't always mean that the corresponding MsgStorageAppend |
| // message was the one that carried these entries, just that those entries were |
| // stable at the time of processing the corresponding MsgStorageAppend. |
| optional uint64 logTerm = 5 [(gogoproto.nullable) = false]; |
| // Log index the message refers to; its exact meaning depends on type (see the |
| // logTerm examples above). |
| optional uint64 index = 6 [(gogoproto.nullable) = false]; |
| // Log entries carried by the message (e.g. the entries a MsgApp appends). |
| repeated Entry entries = 7 [(gogoproto.nullable) = false]; |
| // NOTE(review): presumably the sender's commit index — confirm. |
| optional uint64 commit = 8 [(gogoproto.nullable) = false]; |
| // (type=MsgStorageAppend,vote=5,term=10) means the local node is voting for |
| // peer 5 in term 10. For MsgStorageAppends, the term, vote, and commit fields |
| // will either all be set (to facilitate the construction of a HardState) if |
| // any of the fields have changed or will all be unset if none of the fields |
| // have changed. |
| optional uint64 vote = 13 [(gogoproto.nullable) = false]; |
| // snapshot is non-nil and non-empty for MsgSnap messages and nil for all other |
| // message types. However, peer nodes running older binary versions may send a |
| // non-nil, empty value for the snapshot field of non-MsgSnap messages. Code |
| // should be prepared to handle such messages. |
| optional Snapshot snapshot = 9 [(gogoproto.nullable) = true]; |
| // Set on a response to signal rejection (see the MsgAppResp example under |
| // logTerm). |
| optional bool reject = 10 [(gogoproto.nullable) = false]; |
| // NOTE(review): presumably a hint accompanying a rejection that helps the |
| // sender pick the next index to try — confirm. |
| optional uint64 rejectHint = 11 [(gogoproto.nullable) = false]; |
| // Optional bytes attached to the message; explicitly nullable. |
| optional bytes context = 12 [(gogoproto.nullable) = true]; |
| // responses are populated by a raft node to instruct storage threads on how |
| // to respond and who to respond to when the work associated with a message |
| // is complete. Populated for MsgStorageAppend and MsgStorageApply messages. |
| repeated Message responses = 14 [(gogoproto.nullable) = false]; |
| } |
| |
| // HardState groups a term, a vote and a commit value. The Message.vote comment |
| // notes that MsgStorageAppend sets its term, vote and commit fields together |
| // precisely so that a HardState can be constructed from them. |
| // NOTE(review): presumably this is the state a node must persist to stable |
| // storage — confirm against the raft package. |
| message HardState { |
| optional uint64 term = 1 [(gogoproto.nullable) = false]; |
| // Peer voted for in term (see the Message.vote example). |
| optional uint64 vote = 2 [(gogoproto.nullable) = false]; |
| optional uint64 commit = 3 [(gogoproto.nullable) = false]; |
| } |
| |
| // ConfChangeTransition specifies the behavior of a configuration change with |
| // respect to joint consensus. It is used by ConfChangeV2.transition. |
| enum ConfChangeTransition { |
| // Automatically use the simple protocol if possible, otherwise fall back |
| // to ConfChangeTransitionJointImplicit. Most applications will want to use |
| // this. |
| ConfChangeTransitionAuto = 0; |
| // Use joint consensus unconditionally, and transition out of it |
| // automatically (by proposing a zero configuration change). |
| // |
| // This option is suitable for applications that want to minimize the time |
| // spent in the joint configuration and do not store the joint configuration |
| // in the state machine (outside of InitialState). |
| ConfChangeTransitionJointImplicit = 1; |
| // Use joint consensus and remain in the joint configuration until the |
| // application proposes a no-op configuration change. This is suitable for |
| // applications that want to explicitly control the transitions, for example |
| // to use a custom payload (via the Context field). |
| ConfChangeTransitionJointExplicit = 2; |
| } |
| |
| // ConfState describes a Raft configuration: the voters and learners of the |
| // incoming config and, while the configuration is joint, the voters of the |
| // outgoing config as well. It is embedded in SnapshotMetadata.conf_state. |
| message ConfState { |
| // The voters in the incoming config. (If the configuration is not joint, |
| // then the outgoing config is empty). |
| repeated uint64 voters = 1; |
| // The learners in the incoming config. |
| repeated uint64 learners = 2; |
| // The voters in the outgoing config. |
| repeated uint64 voters_outgoing = 3; |
| // The nodes that will become learners when the outgoing config is removed. |
| // These nodes are necessarily currently in voters_outgoing (or they would |
| // have been added to the incoming config right away). |
| repeated uint64 learners_next = 4; |
| // If set, the config is joint and Raft will automatically transition into |
| // the final config (i.e. remove the outgoing config) when this is safe. |
| optional bool auto_leave = 5 [(gogoproto.nullable) = false]; |
| } |
| |
| // ConfChangeType selects the operation performed by a configuration change. |
| // It is used by ConfChange.type and ConfChangeSingle.type. |
| // NOTE(review): the per-value semantics (add voter, remove node, update node, |
| // add learner) are inferred from the value names only — confirm against the |
| // raft package before relying on them. |
| enum ConfChangeType { |
| // First declared value, and therefore the default when type is unset. |
| ConfChangeAddNode = 0; |
| ConfChangeRemoveNode = 1; |
| ConfChangeUpdateNode = 2; |
| ConfChangeAddLearnerNode = 3; |
| } |
| |
| // ConfChange describes one configuration change: a ConfChangeType applied to a |
| // single node, plus an optional context payload. Entries of type |
| // EntryConfChange correspond to this message. See ConfChangeV2 for the form |
| // that can carry several changes at once. |
| // |
| // NOTE: declaration order does not follow field-number order (id = 1 is |
| // declared last). |
| message ConfChange { |
| // Operation to perform. |
| optional ConfChangeType type = 2 [(gogoproto.nullable) = false]; |
| // Node the operation applies to; customname sets the generated Go field name |
| // to NodeID. |
| optional uint64 node_id = 3 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"]; |
| // Optional payload bytes attached to the change. |
| optional bytes context = 4; |
| |
| // NB: this is used only by etcd to thread through a unique identifier. |
| // Ideally it should really use the Context instead. No counterpart to |
| // this field exists in ConfChangeV2. |
| optional uint64 id = 1 [(gogoproto.nullable) = false, (gogoproto.customname) = "ID"]; |
| } |
| |
| // ConfChangeSingle is an individual configuration change operation. Multiple |
| // such operations can be carried out atomically via a ConfChangeV2 (see |
| // ConfChangeV2.changes). |
| message ConfChangeSingle { |
| // Operation to perform. |
| optional ConfChangeType type = 1 [(gogoproto.nullable) = false]; |
| // Node the operation applies to; customname sets the generated Go field name |
| // to NodeID. |
| optional uint64 node_id = 2 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"]; |
| } |
| |
| // ConfChangeV2 messages initiate configuration changes. They support both the |
| // simple "one at a time" membership change protocol and full Joint Consensus |
| // allowing for arbitrary changes in membership. |
| // |
| // The supplied context is treated as an opaque payload and can be used to |
| // attach an action on the state machine to the application of the config change |
| // proposal. Note that contrary to Joint Consensus as outlined in the Raft |
| // paper[1], configuration changes become active when they are *applied* to the |
| // state machine (not when they are appended to the log). |
| // |
| // The simple protocol can be used whenever only a single change is made. |
| // |
| // Non-simple changes require the use of Joint Consensus, for which two |
| // configuration changes are run. The first configuration change specifies the |
| // desired changes and transitions the Raft group into the joint configuration, |
| // in which quorum requires a majority of both the pre-changes and post-changes |
| // configuration. Joint Consensus avoids entering fragile intermediate |
| // configurations that could compromise survivability. For example, without the |
| // use of Joint Consensus and running across three availability zones with a |
| // replication factor of three, it is not possible to replace a voter without |
| // entering an intermediate configuration that does not survive the outage of |
| // one availability zone. |
| // |
| // The provided ConfChangeTransition specifies how (and whether) Joint Consensus |
| // is used, and assigns the task of leaving the joint configuration either to |
| // Raft or the application. Leaving the joint configuration is accomplished by |
| // proposing a ConfChangeV2 that contains no changes; only the Context field |
| // may (optionally) be populated. |
| // |
| // For details on Raft membership changes, see: |
| // |
| // [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf |
| message ConfChangeV2 { |
| // How (and whether) joint consensus is used; see ConfChangeTransition. |
| optional ConfChangeTransition transition = 1 [(gogoproto.nullable) = false]; |
| // The individual operations to carry out atomically. |
| repeated ConfChangeSingle changes = 2 [(gogoproto.nullable) = false]; |
| // Opaque payload that can attach a state-machine action to the application |
| // of this config change. |
| optional bytes context = 3; |
| } |