syntax = "proto2";
package raftpb;

import "gogoproto/gogo.proto";

// File-wide gogoproto code-generation options (see the gogoproto extensions
// documentation for the exact effect of each one).
//
// Emit generated Marshal, Size and Unmarshal methods for every message.
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
// Do not generate Get* accessors, and do not prefix enum value names with
// the enum type name.
option (gogoproto.goproto_getters_all) = false;
option (gogoproto.goproto_enum_prefix_all) = false;
// Omit the XXX_* bookkeeping fields from the generated Go structs.
option (gogoproto.goproto_unkeyed_all) = false;
option (gogoproto.goproto_unrecognized_all) = false;
option (gogoproto.goproto_sizecache_all) = false;
| 14 | |
// EntryType tags an Entry (see Entry.Type) with the kind of data it holds.
enum EntryType {
  EntryNormal = 0;
  EntryConfChange = 1;    // corresponds to pb.ConfChange
  EntryConfChangeV2 = 2;  // corresponds to pb.ConfChangeV2
}
| 20 | |
// Entry is a single log entry: a term, an index, a type tag and an opaque
// data payload.
//
// NOTE: the fields are intentionally declared out of field-number order
// (Term and Index come before Type, which is field 1). Per the inline notes
// the two uint64 fields must remain 64-bit aligned, so do not reorder the
// declarations. NOTE(review): presumably the generated struct follows
// declaration order - confirm before touching the layout.
message Entry {
  // must be 64-bit aligned for atomic operations
  optional uint64 Term = 2 [(gogoproto.nullable) = false];
  // must be 64-bit aligned for atomic operations
  optional uint64 Index = 3 [(gogoproto.nullable) = false];
  optional EntryType Type = 1 [(gogoproto.nullable) = false];
  optional bytes Data = 4;
}
| 27 | |
// SnapshotMetadata accompanies the data of a Snapshot: a ConfState together
// with an index and a term.
message SnapshotMetadata {
  optional ConfState conf_state = 1 [(gogoproto.nullable) = false];
  optional uint64 index = 2 [(gogoproto.nullable) = false];
  optional uint64 term = 3 [(gogoproto.nullable) = false];
}
| 33 | |
// Snapshot pairs an opaque data blob with the SnapshotMetadata describing it.
message Snapshot {
  optional bytes data = 1;
  optional SnapshotMetadata metadata = 2 [(gogoproto.nullable) = false];
}
| 38 | |
// MessageType selects how the fields of a Message are interpreted.
//
// For a description of the different message types, see:
// https://pkg.go.dev/go.etcd.io/raft/v3#hdr-MessageType
enum MessageType {
  MsgHup = 0;
  MsgBeat = 1;
  MsgProp = 2;
  MsgApp = 3;
  MsgAppResp = 4;
  MsgVote = 5;
  MsgVoteResp = 6;
  MsgSnap = 7;
  MsgHeartbeat = 8;
  MsgHeartbeatResp = 9;
  MsgUnreachable = 10;
  MsgSnapStatus = 11;
  MsgCheckQuorum = 12;
  MsgTransferLeader = 13;
  MsgTimeoutNow = 14;
  MsgReadIndex = 15;
  MsgReadIndexResp = 16;
  MsgPreVote = 17;
  MsgPreVoteResp = 18;
  MsgStorageAppend = 19;
  MsgStorageAppendResp = 20;
  MsgStorageApply = 21;
  MsgStorageApplyResp = 22;
  MsgForgetLeader = 23;
  // NOTE: when adding new message types, remember to update the isLocalMsg and
  // isResponseMsg arrays in raft/util.go and update the corresponding tests in
  // raft/util_test.go.
}
| 70 | |
// Message is the unit exchanged between raft nodes and, for the MsgStorage*
// types, between a raft node and its local storage threads. Which fields are
// meaningful depends on the value of type; see the per-field notes below.
message Message {
  optional MessageType type = 1 [(gogoproto.nullable) = false];
  optional uint64 to = 2 [(gogoproto.nullable) = false];
  optional uint64 from = 3 [(gogoproto.nullable) = false];
  optional uint64 term = 4 [(gogoproto.nullable) = false];
  // logTerm is generally used for appending Raft logs to followers. For
  // example, (type=MsgApp,index=100,logTerm=5) means the leader appends
  // entries starting at index=101, and the term of the entry at index 100 is 5.
  // (type=MsgAppResp,reject=true,index=100,logTerm=5) means the follower
  // rejects some entries from its leader as it already has an entry with term
  // 5 at index 100.
  // (type=MsgStorageAppendResp,index=100,logTerm=5) means the local node wrote
  // entries up to index=100 in stable storage, and the term of the entry at
  // index 100 was 5. This doesn't always mean that the corresponding
  // MsgStorageAppend message was the one that carried these entries, just that
  // those entries were stable at the time of processing the corresponding
  // MsgStorageAppend.
  optional uint64 logTerm = 5 [(gogoproto.nullable) = false];
  optional uint64 index = 6 [(gogoproto.nullable) = false];
  repeated Entry entries = 7 [(gogoproto.nullable) = false];
  optional uint64 commit = 8 [(gogoproto.nullable) = false];
  // (type=MsgStorageAppend,vote=5,term=10) means the local node is voting for
  // peer 5 in term 10. For MsgStorageAppends, the term, vote, and commit
  // fields will either all be set (to facilitate the construction of a
  // HardState) if any of the fields have changed or will all be unset if none
  // of the fields have changed.
  optional uint64 vote = 13 [(gogoproto.nullable) = false];
  // snapshot is non-nil and non-empty for MsgSnap messages and nil for all
  // other message types. However, peer nodes running older binary versions may
  // send a non-nil, empty value for the snapshot field of non-MsgSnap
  // messages. Code should be prepared to handle such messages.
  optional Snapshot snapshot = 9 [(gogoproto.nullable) = true];
  optional bool reject = 10 [(gogoproto.nullable) = false];
  optional uint64 rejectHint = 11 [(gogoproto.nullable) = false];
  optional bytes context = 12 [(gogoproto.nullable) = true];
  // responses are populated by a raft node to instruct storage threads on how
  // to respond and who to respond to when the work associated with a message
  // is complete. Populated for MsgStorageAppend and MsgStorageApply messages.
  repeated Message responses = 14 [(gogoproto.nullable) = false];
}
| 109 | |
// HardState groups the term, vote and commit values of a node. These are the
// same three values that a MsgStorageAppend carries in Message.term,
// Message.vote and Message.commit when any of them has changed.
message HardState {
  optional uint64 term = 1 [(gogoproto.nullable) = false];
  optional uint64 vote = 2 [(gogoproto.nullable) = false];
  optional uint64 commit = 3 [(gogoproto.nullable) = false];
}
| 115 | |
// ConfChangeTransition specifies the behavior of a configuration change with
// respect to joint consensus.
enum ConfChangeTransition {
  // Automatically use the simple protocol if possible, otherwise fall back
  // to ConfChangeTransitionJointImplicit. Most applications will want to use
  // this.
  ConfChangeTransitionAuto = 0;
  // Use joint consensus unconditionally, and transition out of it
  // automatically (by proposing a zero configuration change).
  //
  // This option is suitable for applications that want to minimize the time
  // spent in the joint configuration and do not store the joint configuration
  // in the state machine (outside of InitialState).
  ConfChangeTransitionJointImplicit = 1;
  // Use joint consensus and remain in the joint configuration until the
  // application proposes a no-op configuration change. This is suitable for
  // applications that want to explicitly control the transitions, for example
  // to use a custom payload (via the Context field).
  ConfChangeTransitionJointExplicit = 2;
}
| 135 | |
// ConfState describes the membership of a Raft group: the incoming config
// (voters and learners) and, while the configuration is joint, the outgoing
// config as well.
message ConfState {
  // The voters in the incoming config. (If the configuration is not joint,
  // then the outgoing config is empty).
  repeated uint64 voters = 1;
  // The learners in the incoming config.
  repeated uint64 learners = 2;
  // The voters in the outgoing config.
  repeated uint64 voters_outgoing = 3;
  // The nodes that will become learners when the outgoing config is removed.
  // These nodes are necessarily currently in voters_outgoing (or they would
  // have been added to the incoming config right away).
  repeated uint64 learners_next = 4;
  // If set, the config is joint and Raft will automatically transition into
  // the final config (i.e. remove the outgoing config) when this is safe.
  optional bool auto_leave = 5 [(gogoproto.nullable) = false];
}
| 152 | |
// ConfChangeType names the membership operation requested by a ConfChange or
// a ConfChangeSingle.
enum ConfChangeType {
  ConfChangeAddNode = 0;
  ConfChangeRemoveNode = 1;
  ConfChangeUpdateNode = 2;
  ConfChangeAddLearnerNode = 3;
}
| 159 | |
// ConfChange describes a single membership operation (type) applied to one
// node (node_id), with an optional opaque context payload.
//
// NOTE: id carries field number 1 but is declared last; the declaration order
// is kept as-is.
message ConfChange {
  optional ConfChangeType type = 2 [(gogoproto.nullable) = false];
  optional uint64 node_id = 3
      [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"];
  optional bytes context = 4;

  // NB: this is used only by etcd to thread through a unique identifier.
  // Ideally it should really use the Context instead. No counterpart to
  // this field exists in ConfChangeV2.
  optional uint64 id = 1
      [(gogoproto.nullable) = false, (gogoproto.customname) = "ID"];
}
| 170 | |
// ConfChangeSingle is an individual configuration change operation. Multiple
// such operations can be carried out atomically via a ConfChangeV2.
message ConfChangeSingle {
  // The operation to perform.
  optional ConfChangeType type = 1 [(gogoproto.nullable) = false];
  // The node the operation applies to.
  optional uint64 node_id = 2
      [(gogoproto.nullable) = false, (gogoproto.customname) = "NodeID"];
}
| 177 | |
// ConfChangeV2 messages initiate configuration changes. They support both the
// simple "one at a time" membership change protocol and full Joint Consensus
// allowing for arbitrary changes in membership.
//
// The supplied context is treated as an opaque payload and can be used to
// attach an action on the state machine to the application of the config
// change proposal. Note that contrary to Joint Consensus as outlined in the
// Raft paper[1], configuration changes become active when they are *applied*
// to the state machine (not when they are appended to the log).
//
// The simple protocol can be used whenever only a single change is made.
//
// Non-simple changes require the use of Joint Consensus, for which two
// configuration changes are run. The first configuration change specifies the
// desired changes and transitions the Raft group into the joint configuration,
// in which quorum requires a majority of both the pre-changes and post-changes
// configuration. Joint Consensus avoids entering fragile intermediate
// configurations that could compromise survivability. For example, without the
// use of Joint Consensus and running across three availability zones with a
// replication factor of three, it is not possible to replace a voter without
// entering an intermediate configuration that does not survive the outage of
// one availability zone.
//
// The provided ConfChangeTransition specifies how (and whether) Joint
// Consensus is used, and assigns the task of leaving the joint configuration
// either to Raft or the application. Leaving the joint configuration is
// accomplished by proposing a ConfChangeV2 in which nothing is populated
// except, optionally, the Context field.
//
// For details on Raft membership changes, see:
//
// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf
message ConfChangeV2 {
  // How (and whether) Joint Consensus is used for this change.
  optional ConfChangeTransition transition = 1 [(gogoproto.nullable) = false];
  // The individual operations to carry out atomically.
  repeated ConfChangeSingle changes = 2 [(gogoproto.nullable) = false];
  // Opaque payload; see the message comment above.
  optional bytes context = 3;
}