// Protocol Buffers definitions for the raftpb package.
syntax = "proto2";
package raftpb;

import "gogoproto/gogo.proto";

// File-wide gogoproto code-generation options.
// Generate Marshal, Size and Unmarshal methods for every message.
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
// Do not generate getter methods.
option (gogoproto.goproto_getters_all) = false;
// Do not prefix generated enum constants with the enum type name.
option (gogoproto.goproto_enum_prefix_all) = false;
// Omit the XXX_NoUnkeyedLiteral, XXX_unrecognized and XXX_sizecache
// bookkeeping fields from the generated structs.
option (gogoproto.goproto_unkeyed_all) = false;
option (gogoproto.goproto_unrecognized_all) = false;
option (gogoproto.goproto_sizecache_all) = false;

// EntryType is the type tag stored in Entry.Type.
enum EntryType {
  EntryNormal = 0;
  EntryConfChange = 1; // corresponds to pb.ConfChange
  EntryConfChangeV2 = 2; // corresponds to pb.ConfChangeV2
}

// Entry is a single raft log entry.
//
// The declaration order intentionally differs from the tag order: Term and
// Index carry an alignment requirement (see the per-field notes), which is
// presumably why they are declared first. Do not reorder the declarations and
// do not renumber the tags, which are part of the wire format.
message Entry {
  optional uint64 Term = 2 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
  optional uint64 Index = 3 [(gogoproto.nullable) = false]; // must be 64-bit aligned for atomic operations
  optional EntryType Type = 1 [(gogoproto.nullable) = false];
  // Payload of the entry; for the ConfChange entry types this presumably
  // holds the corresponding serialized message (see EntryType).
  optional bytes Data = 4;
}

// SnapshotMetadata describes a Snapshot: the configuration state it captures
// together with a log position (index and term).
//
// NOTE(review): index/term are presumably those of the last log entry covered
// by the snapshot; confirm against the raft package documentation.
message SnapshotMetadata {
  optional ConfState conf_state = 1 [(gogoproto.nullable) = false];
  optional uint64 index = 2 [(gogoproto.nullable) = false];
  optional uint64 term = 3 [(gogoproto.nullable) = false];
}

// Snapshot pairs an opaque snapshot payload with the metadata describing it.
message Snapshot {
  // Snapshot payload as raw bytes; no structure is defined in this file.
  optional bytes data = 1;
  optional SnapshotMetadata metadata = 2 [(gogoproto.nullable) = false];
}

// MessageType enumerates the kinds of Message (see Message.type).
//
// For description of different message types, see:
// https://pkg.go.dev/go.etcd.io/raft/v3#hdr-MessageType
enum MessageType {
  MsgHup = 0;
  MsgBeat = 1;
  MsgProp = 2;
  MsgApp = 3;
  MsgAppResp = 4;
  MsgVote = 5;
  MsgVoteResp = 6;
  MsgSnap = 7;
  MsgHeartbeat = 8;
  MsgHeartbeatResp = 9;
  MsgUnreachable = 10;
  MsgSnapStatus = 11;
  MsgCheckQuorum = 12;
  MsgTransferLeader = 13;
  MsgTimeoutNow = 14;
  MsgReadIndex = 15;
  MsgReadIndexResp = 16;
  MsgPreVote = 17;
  MsgPreVoteResp = 18;
  MsgStorageAppend = 19;
  MsgStorageAppendResp = 20;
  MsgStorageApply = 21;
  MsgStorageApplyResp = 22;
  MsgForgetLeader = 23;
  // NOTE: when adding new message types, remember to update the isLocalMsg and
  // isResponseMsg arrays in raft/util.go and update the corresponding tests in
  // raft/util_test.go.
}

// Message is the envelope for every raft message; `type` identifies which
// MessageType it is.
//
// Tags are fixed wire identifiers. Note that vote (13) and responses (14) are
// declared out of numeric order; never renumber or reuse a tag.
message Message {
  optional MessageType type = 1 [(gogoproto.nullable) = false];
  optional uint64 to = 2 [(gogoproto.nullable) = false];
  optional uint64 from = 3 [(gogoproto.nullable) = false];
  optional uint64 term = 4 [(gogoproto.nullable) = false];
  // logTerm is generally used for appending Raft logs to followers. For example,
  // (type=MsgApp,index=100,logTerm=5) means the leader appends entries starting
  // at index=101, and the term of the entry at index 100 is 5.
  // (type=MsgAppResp,reject=true,index=100,logTerm=5) means follower rejects some
  // entries from its leader as it already has an entry with term 5 at index 100.
  // (type=MsgStorageAppendResp,index=100,logTerm=5) means the local node wrote
  // entries up to index=100 in stable storage, and the term of the entry at index
  // 100 was 5. This doesn't always mean that the corresponding MsgStorageAppend
  // message was the one that carried these entries, just that those entries were
  // stable at the time of processing the corresponding MsgStorageAppend.
  optional uint64 logTerm = 5 [(gogoproto.nullable) = false];
  optional uint64 index = 6 [(gogoproto.nullable) = false];
  repeated Entry entries = 7 [(gogoproto.nullable) = false];
  optional uint64 commit = 8 [(gogoproto.nullable) = false];
  // (type=MsgStorageAppend,vote=5,term=10) means the local node is voting for
  // peer 5 in term 10. For MsgStorageAppends, the term, vote, and commit fields
  // will either all be set (to facilitate the construction of a HardState) if
  // any of the fields have changed or will all be unset if none of the fields
  // have changed.
  optional uint64 vote = 13 [(gogoproto.nullable) = false];
  // snapshot is non-nil and non-empty for MsgSnap messages and nil for all other
  // message types. However, peer nodes running older binary versions may send a
  // non-nil, empty value for the snapshot field of non-MsgSnap messages. Code
  // should be prepared to handle such messages.
  optional Snapshot snapshot = 9 [(gogoproto.nullable) = true];
  optional bool reject = 10 [(gogoproto.nullable) = false];
  optional uint64 rejectHint = 11 [(gogoproto.nullable) = false];
  optional bytes context = 12 [(gogoproto.nullable) = true];
  // responses are populated by a raft node to instruct storage threads on how
  // to respond and who to respond to when the work associated with a message
  // is complete. Populated for MsgStorageAppend and MsgStorageApply messages.
  repeated Message responses = 14 [(gogoproto.nullable) = false];
}

// HardState groups the term, vote and commit values. The comment on
// Message.vote describes how a MsgStorageAppend carries the same three fields
// so that a HardState can be constructed from it.
message HardState {
  optional uint64 term = 1 [(gogoproto.nullable) = false];
  optional uint64 vote = 2 [(gogoproto.nullable) = false];
  optional uint64 commit = 3 [(gogoproto.nullable) = false];
}

// ConfChangeTransition specifies the behavior of a configuration change with
// respect to joint consensus.
enum ConfChangeTransition {
  // Automatically use the simple protocol if possible, otherwise fall back
  // to ConfChangeJointImplicit. Most applications will want to use this.
  ConfChangeTransitionAuto = 0;
  // Use joint consensus unconditionally, and transition out of it
  // automatically (by proposing a zero configuration change).
  //
  // This option is suitable for applications that want to minimize the time
  // spent in the joint configuration and do not store the joint configuration
  // in the state machine (outside of InitialState).
  ConfChangeTransitionJointImplicit = 1;
  // Use joint consensus and remain in the joint configuration until the
  // application proposes a no-op configuration change. This is suitable for
  // applications that want to explicitly control the transitions, for example
  // to use a custom payload (via the Context field).
  ConfChangeTransitionJointExplicit = 2;
}

// ConfState describes the membership configuration: the incoming config
// (voters, learners) and, while a joint configuration is in effect, the
// outgoing config as well.
message ConfState {
  // The voters in the incoming config. (If the configuration is not joint,
  // then the outgoing config is empty).
  repeated uint64 voters = 1;
  // The learners in the incoming config.
  repeated uint64 learners = 2;
  // The voters in the outgoing config.
  repeated uint64 voters_outgoing = 3;
  // The nodes that will become learners when the outgoing config is removed.
  // These nodes are necessarily currently in voters_outgoing (or they would
  // have been added to the incoming config right away).
  repeated uint64 learners_next = 4;
  // If set, the config is joint and Raft will automatically transition into
  // the final config (i.e. remove the outgoing config) when this is safe.
  optional bool auto_leave = 5 [(gogoproto.nullable) = false];
}

// ConfChangeType enumerates the membership operations that a ConfChange or a
// ConfChangeSingle can request for a node.
enum ConfChangeType {
  ConfChangeAddNode = 0;
  ConfChangeRemoveNode = 1;
  ConfChangeUpdateNode = 2;
  ConfChangeAddLearnerNode = 3;
}

// ConfChange requests one membership operation (type) for one node (node_id).
//
// Tags are not in declaration order (id is tag 1 but declared last); they are
// part of the wire format and must not be renumbered.
message ConfChange {
  optional ConfChangeType type = 2 [(gogoproto.nullable) = false];
  optional uint64 node_id = 3 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"];
  optional bytes context = 4;

  // NB: this is used only by etcd to thread through a unique identifier.
  // Ideally it should really use the Context instead. No counterpart to
  // this field exists in ConfChangeV2.
  optional uint64 id = 1 [(gogoproto.nullable) = false, (gogoproto.customname) = "ID"];
}

// ConfChangeSingle is an individual configuration change operation. Multiple
// such operations can be carried out atomically via a ConfChangeV2.
message ConfChangeSingle {
  optional ConfChangeType type = 1 [(gogoproto.nullable) = false];
  optional uint64 node_id = 2 [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"];
}

// ConfChangeV2 messages initiate configuration changes. They support both the
// simple "one at a time" membership change protocol and full Joint Consensus
// allowing for arbitrary changes in membership.
//
// The supplied context is treated as an opaque payload and can be used to
// attach an action on the state machine to the application of the config change
// proposal. Note that contrary to Joint Consensus as outlined in the Raft
// paper[1], configuration changes become active when they are *applied* to the
// state machine (not when they are appended to the log).
//
// The simple protocol can be used whenever only a single change is made.
//
// Non-simple changes require the use of Joint Consensus, for which two
// configuration changes are run. The first configuration change specifies the
// desired changes and transitions the Raft group into the joint configuration,
// in which quorum requires a majority of both the pre-changes and post-changes
// configuration. Joint Consensus avoids entering fragile intermediate
// configurations that could compromise survivability. For example, without the
// use of Joint Consensus and running across three availability zones with a
// replication factor of three, it is not possible to replace a voter without
// entering an intermediate configuration that does not survive the outage of
// one availability zone.
//
// The provided ConfChangeTransition specifies how (and whether) Joint Consensus
// is used, and assigns the task of leaving the joint configuration either to
// Raft or the application. Leaving the joint configuration is accomplished by
// proposing a ConfChangeV2 with only and optionally the Context field
// populated.
//
// For details on Raft membership changes, see:
//
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
message ConfChangeV2 {
  // How (and whether) Joint Consensus is used for this change.
  optional ConfChangeTransition transition = 1 [(gogoproto.nullable) = false];
  // The individual operations to carry out atomically.
  repeated ConfChangeSingle changes = 2 [(gogoproto.nullable) = false];
  // Opaque payload; see the message comment above.
  optional bytes context = 3;
}