// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15package etcdserver
16
17import (
18 goruntime "runtime"
19 "time"
20
21 "github.com/prometheus/client_golang/prometheus"
22 "go.uber.org/zap"
23
24 "go.etcd.io/etcd/api/v3/version"
25 "go.etcd.io/etcd/pkg/v3/runtime"
26)
27
28var (
29 hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
30 Namespace: "etcd",
31 Subsystem: "server",
32 Name: "has_leader",
33 Help: "Whether or not a leader exists. 1 is existence, 0 is not.",
34 })
35 isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
36 Namespace: "etcd",
37 Subsystem: "server",
38 Name: "is_leader",
39 Help: "Whether or not this member is a leader. 1 if is, 0 otherwise.",
40 })
41 leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
42 Namespace: "etcd",
43 Subsystem: "server",
44 Name: "leader_changes_seen_total",
45 Help: "The number of leader changes seen.",
46 })
47 learnerPromoteFailed = prometheus.NewCounterVec(
48 prometheus.CounterOpts{
49 Namespace: "etcd",
50 Subsystem: "server",
51 Name: "learner_promote_failures",
52 Help: "The total number of failed learner promotions (likely learner not ready) while this member is leader.",
53 },
54 []string{"Reason"},
55 )
56 learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{
57 Namespace: "etcd",
58 Subsystem: "server",
59 Name: "learner_promote_successes",
60 Help: "The total number of successful learner promotions while this member is leader.",
61 })
62 heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
63 Namespace: "etcd",
64 Subsystem: "server",
65 Name: "heartbeat_send_failures_total",
66 Help: "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
67 })
68 applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
69 Namespace: "etcd",
70 Subsystem: "server",
71 Name: "snapshot_apply_in_progress_total",
72 Help: "1 if the server is applying the incoming snapshot. 0 if none.",
73 })
74 proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
75 Namespace: "etcd",
76 Subsystem: "server",
77 Name: "proposals_committed_total",
78 Help: "The total number of consensus proposals committed.",
79 })
80 proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
81 Namespace: "etcd",
82 Subsystem: "server",
83 Name: "proposals_applied_total",
84 Help: "The total number of consensus proposals applied.",
85 })
86 proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
87 Namespace: "etcd",
88 Subsystem: "server",
89 Name: "proposals_pending",
90 Help: "The current number of pending proposals to commit.",
91 })
92 proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
93 Namespace: "etcd",
94 Subsystem: "server",
95 Name: "proposals_failed_total",
96 Help: "The total number of failed proposals seen.",
97 })
98 slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
99 Namespace: "etcd",
100 Subsystem: "server",
101 Name: "slow_read_indexes_total",
102 Help: "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
103 })
104 readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
105 Namespace: "etcd",
106 Subsystem: "server",
107 Name: "read_indexes_failed_total",
108 Help: "The total number of failed read indexes seen.",
109 })
110 leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
111 Namespace: "etcd_debugging",
112 Subsystem: "server",
113 Name: "lease_expired_total",
114 Help: "The total number of expired leases.",
115 })
116 currentVersion = prometheus.NewGaugeVec(
117 prometheus.GaugeOpts{
118 Namespace: "etcd",
119 Subsystem: "server",
120 Name: "version",
121 Help: "Which version is running. 1 for 'server_version' label with current version.",
122 },
123 []string{"server_version"},
124 )
125 currentGoVersion = prometheus.NewGaugeVec(
126 prometheus.GaugeOpts{
127 Namespace: "etcd",
128 Subsystem: "server",
129 Name: "go_version",
130 Help: "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
131 },
132 []string{"server_go_version"},
133 )
134 serverID = prometheus.NewGaugeVec(
135 prometheus.GaugeOpts{
136 Namespace: "etcd",
137 Subsystem: "server",
138 Name: "id",
139 Help: "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
140 },
141 []string{"server_id"},
142 )
143 serverFeatureEnabled = prometheus.NewGaugeVec(
144 prometheus.GaugeOpts{
145 Name: "etcd_server_feature_enabled",
146 Help: "Whether or not a feature is enabled. 1 is enabled, 0 is not.",
147 },
148 []string{"name", "stage"},
149 )
150 fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
151 Namespace: "os",
152 Subsystem: "fd",
153 Name: "used",
154 Help: "The number of used file descriptors.",
155 })
156 fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
157 Namespace: "os",
158 Subsystem: "fd",
159 Name: "limit",
160 Help: "The file descriptor limit.",
161 })
162)
163
164func init() {
165 prometheus.MustRegister(hasLeader)
166 prometheus.MustRegister(isLeader)
167 prometheus.MustRegister(leaderChanges)
168 prometheus.MustRegister(heartbeatSendFailures)
169 prometheus.MustRegister(applySnapshotInProgress)
170 prometheus.MustRegister(proposalsCommitted)
171 prometheus.MustRegister(proposalsApplied)
172 prometheus.MustRegister(proposalsPending)
173 prometheus.MustRegister(proposalsFailed)
174 prometheus.MustRegister(slowReadIndex)
175 prometheus.MustRegister(readIndexFailed)
176 prometheus.MustRegister(leaseExpired)
177 prometheus.MustRegister(currentVersion)
178 prometheus.MustRegister(currentGoVersion)
179 prometheus.MustRegister(serverID)
180 prometheus.MustRegister(serverFeatureEnabled)
181 prometheus.MustRegister(learnerPromoteSucceed)
182 prometheus.MustRegister(learnerPromoteFailed)
183 prometheus.MustRegister(fdUsed)
184 prometheus.MustRegister(fdLimit)
185
186 currentVersion.With(prometheus.Labels{
187 "server_version": version.Version,
188 }).Set(1)
189 currentGoVersion.With(prometheus.Labels{
190 "server_go_version": goruntime.Version(),
191 }).Set(1)
192}
193
194func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
195 // This ticker will check File Descriptor Requirements ,and count all fds in used.
196 // And recorded some logs when in used >= limit/5*4. Just recorded message.
197 // If fds was more than 10K,It's low performance due to FDUsage() works.
198 // So need to increase it.
199 // See https://github.com/etcd-io/etcd/issues/11969 for more detail.
200 ticker := time.NewTicker(10 * time.Minute)
201 defer ticker.Stop()
202 for {
203 used, err := runtime.FDUsage()
204 if err != nil {
205 lg.Warn("failed to get file descriptor usage", zap.Error(err))
206 return
207 }
208 fdUsed.Set(float64(used))
209 limit, err := runtime.FDLimit()
210 if err != nil {
211 lg.Warn("failed to get file descriptor limit", zap.Error(err))
212 return
213 }
214 fdLimit.Set(float64(limit))
215 if used >= limit/5*4 {
216 lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit))
217 }
218 select {
219 case <-ticker.C:
220 case <-done:
221 return
222 }
223 }
224}