// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| 14 | |
| 15 | package config |
| 16 | |
| 17 | import ( |
| 18 | "context" |
| 19 | "fmt" |
| 20 | "path/filepath" |
| 21 | "sort" |
| 22 | "strings" |
| 23 | "time" |
| 24 | |
| 25 | "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" |
| 26 | "go.uber.org/zap" |
| 27 | |
| 28 | bolt "go.etcd.io/bbolt" |
| 29 | "go.etcd.io/etcd/client/pkg/v3/transport" |
| 30 | "go.etcd.io/etcd/client/pkg/v3/types" |
| 31 | "go.etcd.io/etcd/pkg/v3/featuregate" |
| 32 | "go.etcd.io/etcd/pkg/v3/netutil" |
| 33 | "go.etcd.io/etcd/server/v3/etcdserver/api/v3discovery" |
| 34 | "go.etcd.io/etcd/server/v3/storage/datadir" |
| 35 | ) |
| 36 | |
// grpcOverheadBytes is the fixed headroom (512 KiB) that
// MaxRequestBytesWithOverhead adds on top of MaxRequestBytes.
const grpcOverheadBytes = 512 * 1024
| 40 | |
| 41 | // ServerConfig holds the configuration of etcd as taken from the command line or discovery. |
| 42 | type ServerConfig struct { |
| 43 | Name string |
| 44 | |
| 45 | DiscoveryURL string |
| 46 | DiscoveryProxy string |
| 47 | DiscoveryCfg v3discovery.DiscoveryConfig |
| 48 | |
| 49 | ClientURLs types.URLs |
| 50 | PeerURLs types.URLs |
| 51 | DataDir string |
| 52 | // DedicatedWALDir config will make the etcd to write the WAL to the WALDir |
| 53 | // rather than the dataDir/member/wal. |
| 54 | DedicatedWALDir string |
| 55 | |
| 56 | SnapshotCount uint64 |
| 57 | |
| 58 | // SnapshotCatchUpEntries is the number of entries for a slow follower |
| 59 | // to catch-up after compacting the raft storage entries. |
| 60 | // We expect the follower has a millisecond level latency with the leader. |
| 61 | // The max throughput is around 10K. Keep a 5K entries is enough for helping |
| 62 | // follower to catch up. |
| 63 | SnapshotCatchUpEntries uint64 |
| 64 | |
| 65 | MaxSnapFiles uint |
| 66 | MaxWALFiles uint |
| 67 | |
| 68 | // BackendBatchInterval is the maximum time before commit the backend transaction. |
| 69 | BackendBatchInterval time.Duration |
| 70 | // BackendBatchLimit is the maximum operations before commit the backend transaction. |
| 71 | BackendBatchLimit int |
| 72 | |
| 73 | // BackendFreelistType is the type of the backend boltdb freelist. |
| 74 | BackendFreelistType bolt.FreelistType |
| 75 | |
| 76 | InitialPeerURLsMap types.URLsMap |
| 77 | InitialClusterToken string |
| 78 | NewCluster bool |
| 79 | PeerTLSInfo transport.TLSInfo |
| 80 | |
| 81 | CORS map[string]struct{} |
| 82 | |
| 83 | // HostWhitelist lists acceptable hostnames from client requests. |
| 84 | // If server is insecure (no TLS), server only accepts requests |
| 85 | // whose Host header value exists in this white list. |
| 86 | HostWhitelist map[string]struct{} |
| 87 | |
| 88 | TickMs uint |
| 89 | ElectionTicks int |
| 90 | |
| 91 | // InitialElectionTickAdvance is true, then local member fast-forwards |
| 92 | // election ticks to speed up "initial" leader election trigger. This |
| 93 | // benefits the case of larger election ticks. For instance, cross |
| 94 | // datacenter deployment may require longer election timeout of 10-second. |
| 95 | // If true, local node does not need wait up to 10-second. Instead, |
| 96 | // forwards its election ticks to 8-second, and have only 2-second left |
| 97 | // before leader election. |
| 98 | // |
| 99 | // Major assumptions are that: |
| 100 | // - cluster has no active leader thus advancing ticks enables faster |
| 101 | // leader election, or |
| 102 | // - cluster already has an established leader, and rejoining follower |
| 103 | // is likely to receive heartbeats from the leader after tick advance |
| 104 | // and before election timeout. |
| 105 | // |
| 106 | // However, when network from leader to rejoining follower is congested, |
| 107 | // and the follower does not receive leader heartbeat within left election |
| 108 | // ticks, disruptive election has to happen thus affecting cluster |
| 109 | // availabilities. |
| 110 | // |
| 111 | // Disabling this would slow down initial bootstrap process for cross |
| 112 | // datacenter deployments. Make your own tradeoffs by configuring |
| 113 | // --initial-election-tick-advance at the cost of slow initial bootstrap. |
| 114 | // |
| 115 | // If single-node, it advances ticks regardless. |
| 116 | // |
| 117 | // See https://github.com/etcd-io/etcd/issues/9333 for more detail. |
| 118 | InitialElectionTickAdvance bool |
| 119 | |
| 120 | BootstrapTimeout time.Duration |
| 121 | |
| 122 | AutoCompactionRetention time.Duration |
| 123 | AutoCompactionMode string |
| 124 | CompactionBatchLimit int |
| 125 | CompactionSleepInterval time.Duration |
| 126 | QuotaBackendBytes int64 |
| 127 | MaxTxnOps uint |
| 128 | |
| 129 | // MaxRequestBytes is the maximum request size to send over raft. |
| 130 | MaxRequestBytes uint |
| 131 | |
| 132 | // MaxConcurrentStreams specifies the maximum number of concurrent |
| 133 | // streams that each client can open at a time. |
| 134 | MaxConcurrentStreams uint32 |
| 135 | |
| 136 | WarningApplyDuration time.Duration |
| 137 | WarningUnaryRequestDuration time.Duration |
| 138 | |
| 139 | StrictReconfigCheck bool |
| 140 | |
| 141 | // ClientCertAuthEnabled is true when cert has been signed by the client CA. |
| 142 | ClientCertAuthEnabled bool |
| 143 | |
| 144 | AuthToken string |
| 145 | BcryptCost uint |
| 146 | TokenTTL uint |
| 147 | |
| 148 | // InitialCorruptCheck is true to check data corruption on boot |
| 149 | // before serving any peer/client traffic. |
| 150 | InitialCorruptCheck bool |
| 151 | CorruptCheckTime time.Duration |
| 152 | CompactHashCheckTime time.Duration |
| 153 | |
| 154 | // PreVote is true to enable Raft Pre-Vote. |
| 155 | PreVote bool |
| 156 | |
| 157 | // SocketOpts are socket options passed to listener config. |
| 158 | SocketOpts transport.SocketOpts |
| 159 | |
| 160 | // Logger logs server-side operations. |
| 161 | Logger *zap.Logger |
| 162 | |
| 163 | ForceNewCluster bool |
| 164 | |
| 165 | // LeaseCheckpointInterval time.Duration is the wait duration between lease checkpoints. |
| 166 | LeaseCheckpointInterval time.Duration |
| 167 | |
| 168 | EnableGRPCGateway bool |
| 169 | |
| 170 | // EnableDistributedTracing enables distributed tracing using OpenTelemetry protocol. |
| 171 | EnableDistributedTracing bool |
| 172 | // TracerOptions are options for OpenTelemetry gRPC interceptor. |
| 173 | TracerOptions []otelgrpc.Option |
| 174 | |
| 175 | WatchProgressNotifyInterval time.Duration |
| 176 | |
| 177 | // UnsafeNoFsync disables all uses of fsync. |
| 178 | // Setting this is unsafe and will cause data loss. |
| 179 | UnsafeNoFsync bool `json:"unsafe-no-fsync"` |
| 180 | |
| 181 | DowngradeCheckTime time.Duration |
| 182 | |
| 183 | // MemoryMlock enables mlocking of etcd owned memory pages. |
| 184 | // The setting improves etcd tail latency in environments were: |
| 185 | // - memory pressure might lead to swapping pages to disk |
| 186 | // - disk latency might be unstable |
| 187 | // Currently all etcd memory gets mlocked, but in future the flag can |
| 188 | // be refined to mlock in-use area of bbolt only. |
| 189 | MemoryMlock bool `json:"memory-mlock"` |
| 190 | |
| 191 | // ExperimentalTxnModeWriteWithSharedBuffer enable write transaction to use |
| 192 | // a shared buffer in its readonly check operations. |
| 193 | // TODO: Delete in v3.7 |
| 194 | // Deprecated: Use TxnModeWriteWithSharedBuffer Feature Gate instead. Will be decommissioned in v3.7. |
| 195 | ExperimentalTxnModeWriteWithSharedBuffer bool `json:"experimental-txn-mode-write-with-shared-buffer"` |
| 196 | |
| 197 | // BootstrapDefragThresholdMegabytes is the minimum number of megabytes needed to be freed for etcd server to |
| 198 | // consider running defrag during bootstrap. Needs to be set to non-zero value to take effect. |
| 199 | BootstrapDefragThresholdMegabytes uint `json:"bootstrap-defrag-threshold-megabytes"` |
| 200 | |
| 201 | // MaxLearners sets a limit to the number of learner members that can exist in the cluster membership. |
| 202 | MaxLearners int `json:"max-learners"` |
| 203 | |
| 204 | // V2Deprecation defines a phase of v2store deprecation process. |
| 205 | V2Deprecation V2DeprecationEnum `json:"v2-deprecation"` |
| 206 | |
| 207 | // ExperimentalLocalAddress is the local IP address to use when communicating with a peer. |
| 208 | ExperimentalLocalAddress string `json:"experimental-local-address"` |
| 209 | |
| 210 | // ServerFeatureGate is a server level feature gate |
| 211 | ServerFeatureGate featuregate.FeatureGate |
| 212 | |
| 213 | // Metrics types of metrics - should be either 'basic' or 'extensive' |
| 214 | Metrics string |
| 215 | } |
| 216 | |
| 217 | // VerifyBootstrap sanity-checks the initial config for bootstrap case |
| 218 | // and returns an error for things that should never happen. |
| 219 | func (c *ServerConfig) VerifyBootstrap() error { |
| 220 | if err := c.hasLocalMember(); err != nil { |
| 221 | return err |
| 222 | } |
| 223 | if err := c.advertiseMatchesCluster(); err != nil { |
| 224 | return err |
| 225 | } |
| 226 | if CheckDuplicateURL(c.InitialPeerURLsMap) { |
| 227 | return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap) |
| 228 | } |
| 229 | if c.InitialPeerURLsMap.String() == "" && c.DiscoveryURL == "" { |
| 230 | return fmt.Errorf("initial cluster unset and no discovery URL found") |
| 231 | } |
| 232 | return nil |
| 233 | } |
| 234 | |
| 235 | // VerifyJoinExisting sanity-checks the initial config for join existing cluster |
| 236 | // case and returns an error for things that should never happen. |
| 237 | func (c *ServerConfig) VerifyJoinExisting() error { |
| 238 | // The member has announced its peer urls to the cluster before starting; no need to |
| 239 | // set the configuration again. |
| 240 | if err := c.hasLocalMember(); err != nil { |
| 241 | return err |
| 242 | } |
| 243 | if CheckDuplicateURL(c.InitialPeerURLsMap) { |
| 244 | return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap) |
| 245 | } |
| 246 | if c.DiscoveryURL != "" { |
| 247 | return fmt.Errorf("discovery URL should not be set when joining existing initial cluster") |
| 248 | } |
| 249 | return nil |
| 250 | } |
| 251 | |
| 252 | // hasLocalMember checks that the cluster at least contains the local server. |
| 253 | func (c *ServerConfig) hasLocalMember() error { |
| 254 | if urls := c.InitialPeerURLsMap[c.Name]; urls == nil { |
| 255 | return fmt.Errorf("couldn't find local name %q in the initial cluster configuration", c.Name) |
| 256 | } |
| 257 | return nil |
| 258 | } |
| 259 | |
| 260 | // advertiseMatchesCluster confirms peer URLs match those in the cluster peer list. |
| 261 | func (c *ServerConfig) advertiseMatchesCluster() error { |
| 262 | urls, apurls := c.InitialPeerURLsMap[c.Name], c.PeerURLs.StringSlice() |
| 263 | urls.Sort() |
| 264 | sort.Strings(apurls) |
| 265 | ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) |
| 266 | defer cancel() |
| 267 | ok, err := netutil.URLStringsEqual(ctx, c.Logger, apurls, urls.StringSlice()) |
| 268 | if ok { |
| 269 | return nil |
| 270 | } |
| 271 | |
| 272 | initMap, apMap := make(map[string]struct{}), make(map[string]struct{}) |
| 273 | for _, url := range c.PeerURLs { |
| 274 | apMap[url.String()] = struct{}{} |
| 275 | } |
| 276 | for _, url := range c.InitialPeerURLsMap[c.Name] { |
| 277 | initMap[url.String()] = struct{}{} |
| 278 | } |
| 279 | |
| 280 | var missing []string |
| 281 | for url := range initMap { |
| 282 | if _, ok := apMap[url]; !ok { |
| 283 | missing = append(missing, url) |
| 284 | } |
| 285 | } |
| 286 | if len(missing) > 0 { |
| 287 | for i := range missing { |
| 288 | missing[i] = c.Name + "=" + missing[i] |
| 289 | } |
| 290 | mstr := strings.Join(missing, ",") |
| 291 | apStr := strings.Join(apurls, ",") |
| 292 | return fmt.Errorf("--initial-cluster has %s but missing from --initial-advertise-peer-urls=%s (%w)", mstr, apStr, err) |
| 293 | } |
| 294 | |
| 295 | for url := range apMap { |
| 296 | if _, ok := initMap[url]; !ok { |
| 297 | missing = append(missing, url) |
| 298 | } |
| 299 | } |
| 300 | if len(missing) > 0 { |
| 301 | mstr := strings.Join(missing, ",") |
| 302 | umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs}) |
| 303 | return fmt.Errorf("--initial-advertise-peer-urls has %s but missing from --initial-cluster=%s", mstr, umap.String()) |
| 304 | } |
| 305 | |
| 306 | // resolved URLs from "--initial-advertise-peer-urls" and "--initial-cluster" did not match or failed |
| 307 | apStr := strings.Join(apurls, ",") |
| 308 | umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs}) |
| 309 | return fmt.Errorf("failed to resolve %s to match --initial-cluster=%s (%w)", apStr, umap.String(), err) |
| 310 | } |
| 311 | |
| 312 | func (c *ServerConfig) MemberDir() string { return datadir.ToMemberDir(c.DataDir) } |
| 313 | |
| 314 | func (c *ServerConfig) WALDir() string { |
| 315 | if c.DedicatedWALDir != "" { |
| 316 | return c.DedicatedWALDir |
| 317 | } |
| 318 | return datadir.ToWALDir(c.DataDir) |
| 319 | } |
| 320 | |
| 321 | func (c *ServerConfig) SnapDir() string { return filepath.Join(c.MemberDir(), "snap") } |
| 322 | |
| 323 | func (c *ServerConfig) ShouldDiscover() bool { |
| 324 | return c.DiscoveryURL != "" || len(c.DiscoveryCfg.Endpoints) > 0 |
| 325 | } |
| 326 | |
| 327 | // ReqTimeout returns timeout for request to finish. |
| 328 | func (c *ServerConfig) ReqTimeout() time.Duration { |
| 329 | // 5s for queue waiting, computation and disk IO delay |
| 330 | // + 2 * election timeout for possible leader election |
| 331 | return 5*time.Second + 2*time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond |
| 332 | } |
| 333 | |
| 334 | func (c *ServerConfig) ElectionTimeout() time.Duration { |
| 335 | return time.Duration(c.ElectionTicks*int(c.TickMs)) * time.Millisecond |
| 336 | } |
| 337 | |
| 338 | func (c *ServerConfig) PeerDialTimeout() time.Duration { |
| 339 | // 1s for queue wait and election timeout |
| 340 | return time.Second + time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond |
| 341 | } |
| 342 | |
| 343 | func CheckDuplicateURL(urlsmap types.URLsMap) bool { |
| 344 | um := make(map[string]bool) |
| 345 | for _, urls := range urlsmap { |
| 346 | for _, url := range urls { |
| 347 | u := url.String() |
| 348 | if um[u] { |
| 349 | return true |
| 350 | } |
| 351 | um[u] = true |
| 352 | } |
| 353 | } |
| 354 | return false |
| 355 | } |
| 356 | |
| 357 | func (c *ServerConfig) BootstrapTimeoutEffective() time.Duration { |
| 358 | if c.BootstrapTimeout != 0 { |
| 359 | return c.BootstrapTimeout |
| 360 | } |
| 361 | return time.Second |
| 362 | } |
| 363 | |
| 364 | func (c *ServerConfig) BackendPath() string { return datadir.ToBackendFileName(c.DataDir) } |
| 365 | |
| 366 | func (c *ServerConfig) MaxRequestBytesWithOverhead() uint { |
| 367 | return c.MaxRequestBytes + grpcOverheadBytes |
| 368 | } |