blob: 3bf994b7a9f2978d8d137d2ee362467ae4f1e409 [file] [log] [blame]
Abhay Kumara2ae5992025-11-10 14:02:24 +00001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package config
16
17import (
18 "context"
19 "fmt"
20 "path/filepath"
21 "sort"
22 "strings"
23 "time"
24
25 "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
26 "go.uber.org/zap"
27
28 bolt "go.etcd.io/bbolt"
29 "go.etcd.io/etcd/client/pkg/v3/transport"
30 "go.etcd.io/etcd/client/pkg/v3/types"
31 "go.etcd.io/etcd/pkg/v3/featuregate"
32 "go.etcd.io/etcd/pkg/v3/netutil"
33 "go.etcd.io/etcd/server/v3/etcdserver/api/v3discovery"
34 "go.etcd.io/etcd/server/v3/storage/datadir"
35)
36
const (
	// grpcOverheadBytes is the fixed allowance, in bytes (512 KiB), that
	// MaxRequestBytesWithOverhead adds on top of ServerConfig.MaxRequestBytes.
	grpcOverheadBytes = 512 * 1024
)
40
// ServerConfig holds the configuration of etcd as taken from the command line or discovery.
type ServerConfig struct {
	// Name is the local member's name. It is the key looked up in
	// InitialPeerURLsMap when the configuration is verified.
	Name string

	// DiscoveryURL, when non-empty, makes ShouldDiscover report true.
	DiscoveryURL   string
	DiscoveryProxy string
	// DiscoveryCfg holds the v3 discovery settings. A non-empty Endpoints
	// list also makes ShouldDiscover report true.
	DiscoveryCfg v3discovery.DiscoveryConfig

	ClientURLs types.URLs
	// PeerURLs are the advertised peer URLs (--initial-advertise-peer-urls).
	// During bootstrap verification they are compared against
	// InitialPeerURLsMap[Name].
	PeerURLs types.URLs
	// DataDir is the base directory from which MemberDir, WALDir, SnapDir
	// and BackendPath are derived.
	DataDir string
	// DedicatedWALDir, when non-empty, makes etcd write the WAL to this
	// directory rather than to dataDir/member/wal.
	DedicatedWALDir string

	SnapshotCount uint64

	// SnapshotCatchUpEntries is the number of entries for a slow follower
	// to catch up after compacting the raft storage entries.
	// We expect the follower to have millisecond-level latency with the leader.
	// The max throughput is around 10K. Keeping 5K entries is enough to help
	// a follower catch up.
	SnapshotCatchUpEntries uint64

	MaxSnapFiles uint
	MaxWALFiles  uint

	// BackendBatchInterval is the maximum time before committing the backend transaction.
	BackendBatchInterval time.Duration
	// BackendBatchLimit is the maximum number of operations before committing the backend transaction.
	BackendBatchLimit int

	// BackendFreelistType is the type of the backend boltdb freelist.
	BackendFreelistType bolt.FreelistType

	// InitialPeerURLsMap maps member names to their peer URLs
	// (--initial-cluster). It must contain an entry for Name.
	InitialPeerURLsMap  types.URLsMap
	InitialClusterToken string
	NewCluster          bool
	PeerTLSInfo         transport.TLSInfo

	CORS map[string]struct{}

	// HostWhitelist lists acceptable hostnames from client requests.
	// If the server is insecure (no TLS), the server only accepts requests
	// whose Host header value exists in this white list.
	HostWhitelist map[string]struct{}

	// TickMs and ElectionTicks are multiplied together to obtain the
	// election timeout in milliseconds (see ElectionTimeout).
	TickMs        uint
	ElectionTicks int

	// If InitialElectionTickAdvance is true, the local member fast-forwards
	// election ticks to speed up the "initial" leader election trigger. This
	// benefits the case of larger election ticks. For instance, a cross
	// datacenter deployment may require a longer election timeout of 10 seconds.
	// If true, the local node does not need to wait up to 10 seconds. Instead,
	// it forwards its election ticks to 8 seconds, and has only 2 seconds left
	// before leader election.
	//
	// Major assumptions are that:
	//  - cluster has no active leader thus advancing ticks enables faster
	//    leader election, or
	//  - cluster already has an established leader, and rejoining follower
	//    is likely to receive heartbeats from the leader after tick advance
	//    and before election timeout.
	//
	// However, when the network from the leader to a rejoining follower is
	// congested, and the follower does not receive a leader heartbeat within
	// the remaining election ticks, a disruptive election has to happen, thus
	// affecting cluster availability.
	//
	// Disabling this would slow down the initial bootstrap process for cross
	// datacenter deployments. Make your own tradeoffs by configuring
	// --initial-election-tick-advance at the cost of slow initial bootstrap.
	//
	// If single-node, it advances ticks regardless.
	//
	// See https://github.com/etcd-io/etcd/issues/9333 for more detail.
	InitialElectionTickAdvance bool

	// BootstrapTimeout is the configured bootstrap timeout. When it is zero,
	// BootstrapTimeoutEffective falls back to one second.
	BootstrapTimeout time.Duration

	AutoCompactionRetention time.Duration
	AutoCompactionMode      string
	CompactionBatchLimit    int
	CompactionSleepInterval time.Duration
	QuotaBackendBytes       int64
	MaxTxnOps               uint

	// MaxRequestBytes is the maximum request size to send over raft.
	MaxRequestBytes uint

	// MaxConcurrentStreams specifies the maximum number of concurrent
	// streams that each client can open at a time.
	MaxConcurrentStreams uint32

	WarningApplyDuration        time.Duration
	WarningUnaryRequestDuration time.Duration

	StrictReconfigCheck bool

	// ClientCertAuthEnabled is true when cert has been signed by the client CA.
	ClientCertAuthEnabled bool

	AuthToken  string
	BcryptCost uint
	TokenTTL   uint

	// InitialCorruptCheck is true to check data corruption on boot
	// before serving any peer/client traffic.
	InitialCorruptCheck  bool
	CorruptCheckTime     time.Duration
	CompactHashCheckTime time.Duration

	// PreVote is true to enable Raft Pre-Vote.
	PreVote bool

	// SocketOpts are socket options passed to listener config.
	SocketOpts transport.SocketOpts

	// Logger logs server-side operations.
	Logger *zap.Logger

	ForceNewCluster bool

	// LeaseCheckpointInterval is the wait duration between lease checkpoints.
	LeaseCheckpointInterval time.Duration

	EnableGRPCGateway bool

	// EnableDistributedTracing enables distributed tracing using OpenTelemetry protocol.
	EnableDistributedTracing bool
	// TracerOptions are options for OpenTelemetry gRPC interceptor.
	TracerOptions []otelgrpc.Option

	WatchProgressNotifyInterval time.Duration

	// UnsafeNoFsync disables all uses of fsync.
	// Setting this is unsafe and will cause data loss.
	UnsafeNoFsync bool `json:"unsafe-no-fsync"`

	DowngradeCheckTime time.Duration

	// MemoryMlock enables mlocking of etcd owned memory pages.
	// The setting improves etcd tail latency in environments where:
	//   - memory pressure might lead to swapping pages to disk
	//   - disk latency might be unstable
	// Currently all etcd memory gets mlocked, but in future the flag can
	// be refined to mlock in-use area of bbolt only.
	MemoryMlock bool `json:"memory-mlock"`

	// ExperimentalTxnModeWriteWithSharedBuffer enables write transactions to use
	// a shared buffer in their readonly check operations.
	// TODO: Delete in v3.7
	// Deprecated: Use TxnModeWriteWithSharedBuffer Feature Gate instead. Will be decommissioned in v3.7.
	ExperimentalTxnModeWriteWithSharedBuffer bool `json:"experimental-txn-mode-write-with-shared-buffer"`

	// BootstrapDefragThresholdMegabytes is the minimum number of megabytes needed to be freed for etcd server to
	// consider running defrag during bootstrap. Needs to be set to a non-zero value to take effect.
	BootstrapDefragThresholdMegabytes uint `json:"bootstrap-defrag-threshold-megabytes"`

	// MaxLearners sets a limit to the number of learner members that can exist in the cluster membership.
	MaxLearners int `json:"max-learners"`

	// V2Deprecation defines a phase of v2store deprecation process.
	V2Deprecation V2DeprecationEnum `json:"v2-deprecation"`

	// ExperimentalLocalAddress is the local IP address to use when communicating with a peer.
	ExperimentalLocalAddress string `json:"experimental-local-address"`

	// ServerFeatureGate is a server level feature gate.
	ServerFeatureGate featuregate.FeatureGate

	// Metrics is the type of metrics - should be either 'basic' or 'extensive'.
	Metrics string
}
216
217// VerifyBootstrap sanity-checks the initial config for bootstrap case
218// and returns an error for things that should never happen.
219func (c *ServerConfig) VerifyBootstrap() error {
220 if err := c.hasLocalMember(); err != nil {
221 return err
222 }
223 if err := c.advertiseMatchesCluster(); err != nil {
224 return err
225 }
226 if CheckDuplicateURL(c.InitialPeerURLsMap) {
227 return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap)
228 }
229 if c.InitialPeerURLsMap.String() == "" && c.DiscoveryURL == "" {
230 return fmt.Errorf("initial cluster unset and no discovery URL found")
231 }
232 return nil
233}
234
235// VerifyJoinExisting sanity-checks the initial config for join existing cluster
236// case and returns an error for things that should never happen.
237func (c *ServerConfig) VerifyJoinExisting() error {
238 // The member has announced its peer urls to the cluster before starting; no need to
239 // set the configuration again.
240 if err := c.hasLocalMember(); err != nil {
241 return err
242 }
243 if CheckDuplicateURL(c.InitialPeerURLsMap) {
244 return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap)
245 }
246 if c.DiscoveryURL != "" {
247 return fmt.Errorf("discovery URL should not be set when joining existing initial cluster")
248 }
249 return nil
250}
251
252// hasLocalMember checks that the cluster at least contains the local server.
253func (c *ServerConfig) hasLocalMember() error {
254 if urls := c.InitialPeerURLsMap[c.Name]; urls == nil {
255 return fmt.Errorf("couldn't find local name %q in the initial cluster configuration", c.Name)
256 }
257 return nil
258}
259
260// advertiseMatchesCluster confirms peer URLs match those in the cluster peer list.
261func (c *ServerConfig) advertiseMatchesCluster() error {
262 urls, apurls := c.InitialPeerURLsMap[c.Name], c.PeerURLs.StringSlice()
263 urls.Sort()
264 sort.Strings(apurls)
265 ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
266 defer cancel()
267 ok, err := netutil.URLStringsEqual(ctx, c.Logger, apurls, urls.StringSlice())
268 if ok {
269 return nil
270 }
271
272 initMap, apMap := make(map[string]struct{}), make(map[string]struct{})
273 for _, url := range c.PeerURLs {
274 apMap[url.String()] = struct{}{}
275 }
276 for _, url := range c.InitialPeerURLsMap[c.Name] {
277 initMap[url.String()] = struct{}{}
278 }
279
280 var missing []string
281 for url := range initMap {
282 if _, ok := apMap[url]; !ok {
283 missing = append(missing, url)
284 }
285 }
286 if len(missing) > 0 {
287 for i := range missing {
288 missing[i] = c.Name + "=" + missing[i]
289 }
290 mstr := strings.Join(missing, ",")
291 apStr := strings.Join(apurls, ",")
292 return fmt.Errorf("--initial-cluster has %s but missing from --initial-advertise-peer-urls=%s (%w)", mstr, apStr, err)
293 }
294
295 for url := range apMap {
296 if _, ok := initMap[url]; !ok {
297 missing = append(missing, url)
298 }
299 }
300 if len(missing) > 0 {
301 mstr := strings.Join(missing, ",")
302 umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs})
303 return fmt.Errorf("--initial-advertise-peer-urls has %s but missing from --initial-cluster=%s", mstr, umap.String())
304 }
305
306 // resolved URLs from "--initial-advertise-peer-urls" and "--initial-cluster" did not match or failed
307 apStr := strings.Join(apurls, ",")
308 umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs})
309 return fmt.Errorf("failed to resolve %s to match --initial-cluster=%s (%w)", apStr, umap.String(), err)
310}
311
312func (c *ServerConfig) MemberDir() string { return datadir.ToMemberDir(c.DataDir) }
313
314func (c *ServerConfig) WALDir() string {
315 if c.DedicatedWALDir != "" {
316 return c.DedicatedWALDir
317 }
318 return datadir.ToWALDir(c.DataDir)
319}
320
321func (c *ServerConfig) SnapDir() string { return filepath.Join(c.MemberDir(), "snap") }
322
323func (c *ServerConfig) ShouldDiscover() bool {
324 return c.DiscoveryURL != "" || len(c.DiscoveryCfg.Endpoints) > 0
325}
326
327// ReqTimeout returns timeout for request to finish.
328func (c *ServerConfig) ReqTimeout() time.Duration {
329 // 5s for queue waiting, computation and disk IO delay
330 // + 2 * election timeout for possible leader election
331 return 5*time.Second + 2*time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond
332}
333
334func (c *ServerConfig) ElectionTimeout() time.Duration {
335 return time.Duration(c.ElectionTicks*int(c.TickMs)) * time.Millisecond
336}
337
338func (c *ServerConfig) PeerDialTimeout() time.Duration {
339 // 1s for queue wait and election timeout
340 return time.Second + time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond
341}
342
343func CheckDuplicateURL(urlsmap types.URLsMap) bool {
344 um := make(map[string]bool)
345 for _, urls := range urlsmap {
346 for _, url := range urls {
347 u := url.String()
348 if um[u] {
349 return true
350 }
351 um[u] = true
352 }
353 }
354 return false
355}
356
357func (c *ServerConfig) BootstrapTimeoutEffective() time.Duration {
358 if c.BootstrapTimeout != 0 {
359 return c.BootstrapTimeout
360 }
361 return time.Second
362}
363
364func (c *ServerConfig) BackendPath() string { return datadir.ToBackendFileName(c.DataDir) }
365
366func (c *ServerConfig) MaxRequestBytesWithOverhead() uint {
367 return c.MaxRequestBytes + grpcOverheadBytes
368}