| Abhay Kumar | 40252eb | 2025-10-13 13:25:53 +0000 | [diff] [blame^] | 1 | // Copyright 2015 The etcd Authors |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | package raft |
| 16 | |
| 17 | import ( |
| 18 | "errors" |
| 19 | "sync" |
| 20 | |
| 21 | pb "go.etcd.io/raft/v3/raftpb" |
| 22 | ) |
| 23 | |
| 24 | // ErrCompacted is returned by Storage.Entries/Compact when a requested |
| 25 | // index is unavailable because it predates the last snapshot. |
| 26 | var ErrCompacted = errors.New("requested index is unavailable due to compaction") |
| 27 | |
| 28 | // ErrSnapOutOfDate is returned by Storage.CreateSnapshot when a requested |
| 29 | // index is older than the existing snapshot. |
| 30 | var ErrSnapOutOfDate = errors.New("requested index is older than the existing snapshot") |
| 31 | |
| 32 | // ErrUnavailable is returned by Storage interface when the requested log entries |
| 33 | // are unavailable. |
| 34 | var ErrUnavailable = errors.New("requested entry at index is unavailable") |
| 35 | |
| 36 | // ErrSnapshotTemporarilyUnavailable is returned by the Storage interface when the required |
| 37 | // snapshot is temporarily unavailable. |
| 38 | var ErrSnapshotTemporarilyUnavailable = errors.New("snapshot is temporarily unavailable") |
| 39 | |
| 40 | // Storage is an interface that may be implemented by the application |
| 41 | // to retrieve log entries from storage. |
| 42 | // |
| 43 | // If any Storage method returns an error, the raft instance will |
| 44 | // become inoperable and refuse to participate in elections; the |
| 45 | // application is responsible for cleanup and recovery in this case. |
| 46 | type Storage interface { |
| 47 | // TODO(tbg): split this into two interfaces, LogStorage and StateStorage. |
| 48 | |
| 49 | // InitialState returns the saved HardState and ConfState information. |
| 50 | InitialState() (pb.HardState, pb.ConfState, error) |
| 51 | |
| 52 | // Entries returns a slice of consecutive log entries in the range [lo, hi), |
| 53 | // starting from lo. The maxSize limits the total size of the log entries |
| 54 | // returned, but Entries returns at least one entry if any. |
| 55 | // |
| 56 | // The caller of Entries owns the returned slice, and may append to it. The |
| 57 | // individual entries in the slice must not be mutated, neither by the Storage |
| 58 | // implementation nor the caller. Note that raft may forward these entries |
| 59 | // back to the application via Ready struct, so the corresponding handler must |
| 60 | // not mutate entries either (see comments in Ready struct). |
| 61 | // |
| 62 | // Since the caller may append to the returned slice, Storage implementation |
| 63 | // must protect its state from corruption that such appends may cause. For |
| 64 | // example, common ways to do so are: |
| 65 | // - allocate the slice before returning it (safest option), |
| 66 | // - return a slice protected by Go full slice expression, which causes |
| 67 | // copying on appends (see MemoryStorage). |
| 68 | // |
| 69 | // Returns ErrCompacted if entry lo has been compacted, or ErrUnavailable if |
| 70 | // encountered an unavailable entry in [lo, hi). |
| 71 | Entries(lo, hi, maxSize uint64) ([]pb.Entry, error) |
| 72 | |
| 73 | // Term returns the term of entry i, which must be in the range |
| 74 | // [FirstIndex()-1, LastIndex()]. The term of the entry before |
| 75 | // FirstIndex is retained for matching purposes even though the |
| 76 | // rest of that entry may not be available. |
| 77 | Term(i uint64) (uint64, error) |
| 78 | // LastIndex returns the index of the last entry in the log. |
| 79 | LastIndex() (uint64, error) |
| 80 | // FirstIndex returns the index of the first log entry that is |
| 81 | // possibly available via Entries (older entries have been incorporated |
| 82 | // into the latest Snapshot; if storage only contains the dummy entry the |
| 83 | // first log entry is not available). |
| 84 | FirstIndex() (uint64, error) |
| 85 | // Snapshot returns the most recent snapshot. |
| 86 | // If snapshot is temporarily unavailable, it should return ErrSnapshotTemporarilyUnavailable, |
| 87 | // so raft state machine could know that Storage needs some time to prepare |
| 88 | // snapshot and call Snapshot later. |
| 89 | Snapshot() (pb.Snapshot, error) |
| 90 | } |
| 91 | |
| 92 | type inMemStorageCallStats struct { |
| 93 | initialState, firstIndex, lastIndex, entries, term, snapshot int |
| 94 | } |
| 95 | |
| 96 | // MemoryStorage implements the Storage interface backed by an |
| 97 | // in-memory array. |
| 98 | type MemoryStorage struct { |
| 99 | // Protects access to all fields. Most methods of MemoryStorage are |
| 100 | // run on the raft goroutine, but Append() is run on an application |
| 101 | // goroutine. |
| 102 | sync.Mutex |
| 103 | |
| 104 | hardState pb.HardState |
| 105 | snapshot pb.Snapshot |
| 106 | // ents[i] has raft log position i+snapshot.Metadata.Index |
| 107 | ents []pb.Entry |
| 108 | |
| 109 | callStats inMemStorageCallStats |
| 110 | } |
| 111 | |
| 112 | // NewMemoryStorage creates an empty MemoryStorage. |
| 113 | func NewMemoryStorage() *MemoryStorage { |
| 114 | return &MemoryStorage{ |
| 115 | // When starting from scratch populate the list with a dummy entry at term zero. |
| 116 | ents: make([]pb.Entry, 1), |
| 117 | } |
| 118 | } |
| 119 | |
| 120 | // InitialState implements the Storage interface. |
| 121 | func (ms *MemoryStorage) InitialState() (pb.HardState, pb.ConfState, error) { |
| 122 | ms.callStats.initialState++ |
| 123 | return ms.hardState, ms.snapshot.Metadata.ConfState, nil |
| 124 | } |
| 125 | |
| 126 | // SetHardState saves the current HardState. |
| 127 | func (ms *MemoryStorage) SetHardState(st pb.HardState) error { |
| 128 | ms.Lock() |
| 129 | defer ms.Unlock() |
| 130 | ms.hardState = st |
| 131 | return nil |
| 132 | } |
| 133 | |
| 134 | // Entries implements the Storage interface. |
| 135 | func (ms *MemoryStorage) Entries(lo, hi, maxSize uint64) ([]pb.Entry, error) { |
| 136 | ms.Lock() |
| 137 | defer ms.Unlock() |
| 138 | ms.callStats.entries++ |
| 139 | offset := ms.ents[0].Index |
| 140 | if lo <= offset { |
| 141 | return nil, ErrCompacted |
| 142 | } |
| 143 | if hi > ms.lastIndex()+1 { |
| 144 | getLogger().Panicf("entries' hi(%d) is out of bound lastindex(%d)", hi, ms.lastIndex()) |
| 145 | } |
| 146 | // only contains dummy entries. |
| 147 | if len(ms.ents) == 1 { |
| 148 | return nil, ErrUnavailable |
| 149 | } |
| 150 | |
| 151 | ents := limitSize(ms.ents[lo-offset:hi-offset], entryEncodingSize(maxSize)) |
| 152 | // NB: use the full slice expression to limit what the caller can do with the |
| 153 | // returned slice. For example, an append will reallocate and copy this slice |
| 154 | // instead of corrupting the neighbouring ms.ents. |
| 155 | return ents[:len(ents):len(ents)], nil |
| 156 | } |
| 157 | |
| 158 | // Term implements the Storage interface. |
| 159 | func (ms *MemoryStorage) Term(i uint64) (uint64, error) { |
| 160 | ms.Lock() |
| 161 | defer ms.Unlock() |
| 162 | ms.callStats.term++ |
| 163 | offset := ms.ents[0].Index |
| 164 | if i < offset { |
| 165 | return 0, ErrCompacted |
| 166 | } |
| 167 | if int(i-offset) >= len(ms.ents) { |
| 168 | return 0, ErrUnavailable |
| 169 | } |
| 170 | return ms.ents[i-offset].Term, nil |
| 171 | } |
| 172 | |
| 173 | // LastIndex implements the Storage interface. |
| 174 | func (ms *MemoryStorage) LastIndex() (uint64, error) { |
| 175 | ms.Lock() |
| 176 | defer ms.Unlock() |
| 177 | ms.callStats.lastIndex++ |
| 178 | return ms.lastIndex(), nil |
| 179 | } |
| 180 | |
| 181 | func (ms *MemoryStorage) lastIndex() uint64 { |
| 182 | return ms.ents[0].Index + uint64(len(ms.ents)) - 1 |
| 183 | } |
| 184 | |
| 185 | // FirstIndex implements the Storage interface. |
| 186 | func (ms *MemoryStorage) FirstIndex() (uint64, error) { |
| 187 | ms.Lock() |
| 188 | defer ms.Unlock() |
| 189 | ms.callStats.firstIndex++ |
| 190 | return ms.firstIndex(), nil |
| 191 | } |
| 192 | |
| 193 | func (ms *MemoryStorage) firstIndex() uint64 { |
| 194 | return ms.ents[0].Index + 1 |
| 195 | } |
| 196 | |
| 197 | // Snapshot implements the Storage interface. |
| 198 | func (ms *MemoryStorage) Snapshot() (pb.Snapshot, error) { |
| 199 | ms.Lock() |
| 200 | defer ms.Unlock() |
| 201 | ms.callStats.snapshot++ |
| 202 | return ms.snapshot, nil |
| 203 | } |
| 204 | |
| 205 | // ApplySnapshot overwrites the contents of this Storage object with |
| 206 | // those of the given snapshot. |
| 207 | func (ms *MemoryStorage) ApplySnapshot(snap pb.Snapshot) error { |
| 208 | ms.Lock() |
| 209 | defer ms.Unlock() |
| 210 | |
| 211 | //handle check for old snapshot being applied |
| 212 | msIndex := ms.snapshot.Metadata.Index |
| 213 | snapIndex := snap.Metadata.Index |
| 214 | if msIndex >= snapIndex { |
| 215 | return ErrSnapOutOfDate |
| 216 | } |
| 217 | |
| 218 | ms.snapshot = snap |
| 219 | ms.ents = []pb.Entry{{Term: snap.Metadata.Term, Index: snap.Metadata.Index}} |
| 220 | return nil |
| 221 | } |
| 222 | |
| 223 | // CreateSnapshot makes a snapshot which can be retrieved with Snapshot() and |
| 224 | // can be used to reconstruct the state at that point. |
| 225 | // If any configuration changes have been made since the last compaction, |
| 226 | // the result of the last ApplyConfChange must be passed in. |
| 227 | func (ms *MemoryStorage) CreateSnapshot(i uint64, cs *pb.ConfState, data []byte) (pb.Snapshot, error) { |
| 228 | ms.Lock() |
| 229 | defer ms.Unlock() |
| 230 | if i <= ms.snapshot.Metadata.Index { |
| 231 | return pb.Snapshot{}, ErrSnapOutOfDate |
| 232 | } |
| 233 | |
| 234 | offset := ms.ents[0].Index |
| 235 | if i > ms.lastIndex() { |
| 236 | getLogger().Panicf("snapshot %d is out of bound lastindex(%d)", i, ms.lastIndex()) |
| 237 | } |
| 238 | |
| 239 | ms.snapshot.Metadata.Index = i |
| 240 | ms.snapshot.Metadata.Term = ms.ents[i-offset].Term |
| 241 | if cs != nil { |
| 242 | ms.snapshot.Metadata.ConfState = *cs |
| 243 | } |
| 244 | ms.snapshot.Data = data |
| 245 | return ms.snapshot, nil |
| 246 | } |
| 247 | |
| 248 | // Compact discards all log entries prior to compactIndex. |
| 249 | // It is the application's responsibility to not attempt to compact an index |
| 250 | // greater than raftLog.applied. |
| 251 | func (ms *MemoryStorage) Compact(compactIndex uint64) error { |
| 252 | ms.Lock() |
| 253 | defer ms.Unlock() |
| 254 | offset := ms.ents[0].Index |
| 255 | if compactIndex <= offset { |
| 256 | return ErrCompacted |
| 257 | } |
| 258 | if compactIndex > ms.lastIndex() { |
| 259 | getLogger().Panicf("compact %d is out of bound lastindex(%d)", compactIndex, ms.lastIndex()) |
| 260 | } |
| 261 | |
| 262 | i := compactIndex - offset |
| 263 | // NB: allocate a new slice instead of reusing the old ms.ents. Entries in |
| 264 | // ms.ents are immutable, and can be referenced from outside MemoryStorage |
| 265 | // through slices returned by ms.Entries(). |
| 266 | ents := make([]pb.Entry, 1, uint64(len(ms.ents))-i) |
| 267 | ents[0].Index = ms.ents[i].Index |
| 268 | ents[0].Term = ms.ents[i].Term |
| 269 | ents = append(ents, ms.ents[i+1:]...) |
| 270 | ms.ents = ents |
| 271 | return nil |
| 272 | } |
| 273 | |
| 274 | // Append the new entries to storage. |
| 275 | // TODO (xiangli): ensure the entries are continuous and |
| 276 | // entries[0].Index > ms.entries[0].Index |
| 277 | func (ms *MemoryStorage) Append(entries []pb.Entry) error { |
| 278 | if len(entries) == 0 { |
| 279 | return nil |
| 280 | } |
| 281 | |
| 282 | ms.Lock() |
| 283 | defer ms.Unlock() |
| 284 | |
| 285 | first := ms.firstIndex() |
| 286 | last := entries[0].Index + uint64(len(entries)) - 1 |
| 287 | |
| 288 | // shortcut if there is no new entry. |
| 289 | if last < first { |
| 290 | return nil |
| 291 | } |
| 292 | // truncate compacted entries |
| 293 | if first > entries[0].Index { |
| 294 | entries = entries[first-entries[0].Index:] |
| 295 | } |
| 296 | |
| 297 | offset := entries[0].Index - ms.ents[0].Index |
| 298 | switch { |
| 299 | case uint64(len(ms.ents)) > offset: |
| 300 | // NB: full slice expression protects ms.ents at index >= offset from |
| 301 | // rewrites, as they may still be referenced from outside MemoryStorage. |
| 302 | ms.ents = append(ms.ents[:offset:offset], entries...) |
| 303 | case uint64(len(ms.ents)) == offset: |
| 304 | ms.ents = append(ms.ents, entries...) |
| 305 | default: |
| 306 | getLogger().Panicf("missing log entry [last: %d, append at: %d]", |
| 307 | ms.lastIndex(), entries[0].Index) |
| 308 | } |
| 309 | return nil |
| 310 | } |