blob: d543c30f48369856473068e72adca37a9979a9c3 [file] [log] [blame]
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15package etcdserver
16
17import (
18 goruntime "runtime"
19 "time"
20
21 "github.com/coreos/etcd/pkg/runtime"
22 "github.com/coreos/etcd/version"
23 "github.com/prometheus/client_golang/prometheus"
24)
25
26var (
27 hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
28 Namespace: "etcd",
29 Subsystem: "server",
30 Name: "has_leader",
31 Help: "Whether or not a leader exists. 1 is existence, 0 is not.",
32 })
33 isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
34 Namespace: "etcd",
35 Subsystem: "server",
36 Name: "is_leader",
37 Help: "Whether or not this member is a leader. 1 if is, 0 otherwise.",
38 })
39 leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
40 Namespace: "etcd",
41 Subsystem: "server",
42 Name: "leader_changes_seen_total",
43 Help: "The number of leader changes seen.",
44 })
45 heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
46 Namespace: "etcd",
47 Subsystem: "server",
48 Name: "heartbeat_send_failures_total",
49 Help: "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
50 })
51 slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
52 Namespace: "etcd",
53 Subsystem: "server",
54 Name: "slow_apply_total",
55 Help: "The total number of slow apply requests (likely overloaded from slow disk).",
56 })
57 applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
58 Namespace: "etcd",
59 Subsystem: "server",
60 Name: "snapshot_apply_in_progress_total",
61 Help: "1 if the server is applying the incoming snapshot. 0 if none.",
62 })
63 proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
64 Namespace: "etcd",
65 Subsystem: "server",
66 Name: "proposals_committed_total",
67 Help: "The total number of consensus proposals committed.",
68 })
69 proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
70 Namespace: "etcd",
71 Subsystem: "server",
72 Name: "proposals_applied_total",
73 Help: "The total number of consensus proposals applied.",
74 })
75 proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
76 Namespace: "etcd",
77 Subsystem: "server",
78 Name: "proposals_pending",
79 Help: "The current number of pending proposals to commit.",
80 })
81 proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
82 Namespace: "etcd",
83 Subsystem: "server",
84 Name: "proposals_failed_total",
85 Help: "The total number of failed proposals seen.",
86 })
87 leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
88 Namespace: "etcd_debugging",
89 Subsystem: "server",
90 Name: "lease_expired_total",
91 Help: "The total number of expired leases.",
92 })
93 slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
94 Namespace: "etcd",
95 Subsystem: "server",
96 Name: "slow_read_indexes_total",
97 Help: "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
98 })
99 readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
100 Namespace: "etcd",
101 Subsystem: "server",
102 Name: "read_indexes_failed_total",
103 Help: "The total number of failed read indexes seen.",
104 })
105 quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
106 Namespace: "etcd",
107 Subsystem: "server",
108 Name: "quota_backend_bytes",
109 Help: "Current backend storage quota size in bytes.",
110 })
111 currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
112 Namespace: "etcd",
113 Subsystem: "server",
114 Name: "version",
115 Help: "Which version is running. 1 for 'server_version' label with current version.",
116 },
117 []string{"server_version"})
118 currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
119 Namespace: "etcd",
120 Subsystem: "server",
121 Name: "go_version",
122 Help: "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
123 },
124 []string{"server_go_version"})
125 serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
126 Namespace: "etcd",
127 Subsystem: "server",
128 Name: "id",
129 Help: "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
130 },
131 []string{"server_id"})
132
133 fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
134 Namespace: "os",
135 Subsystem: "fd",
136 Name: "used",
137 Help: "The number of used file descriptors.",
138 })
139 fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
140 Namespace: "os",
141 Subsystem: "fd",
142 Name: "limit",
143 Help: "The file descriptor limit.",
144 })
145)
146
147func init() {
148 prometheus.MustRegister(hasLeader)
149 prometheus.MustRegister(isLeader)
150 prometheus.MustRegister(leaderChanges)
151 prometheus.MustRegister(heartbeatSendFailures)
152 prometheus.MustRegister(slowApplies)
153 prometheus.MustRegister(applySnapshotInProgress)
154 prometheus.MustRegister(proposalsCommitted)
155 prometheus.MustRegister(proposalsApplied)
156 prometheus.MustRegister(proposalsPending)
157 prometheus.MustRegister(proposalsFailed)
158 prometheus.MustRegister(leaseExpired)
159 prometheus.MustRegister(slowReadIndex)
160 prometheus.MustRegister(readIndexFailed)
161 prometheus.MustRegister(quotaBackendBytes)
162 prometheus.MustRegister(currentVersion)
163 prometheus.MustRegister(currentGoVersion)
164 prometheus.MustRegister(serverID)
165 prometheus.MustRegister(fdUsed)
166 prometheus.MustRegister(fdLimit)
167
168 currentVersion.With(prometheus.Labels{
169 "server_version": version.Version,
170 }).Set(1)
171 currentGoVersion.With(prometheus.Labels{
172 "server_go_version": goruntime.Version(),
173 }).Set(1)
174}
175
176func monitorFileDescriptor(done <-chan struct{}) {
177 // This ticker will check File Descriptor Requirements ,and count all fds in used.
178 // And recorded some logs when in used >= limit/5*4. Just recorded message.
179 // If fds was more than 10K,It's low performance due to FDUsage() works.
180 // So need to increase it.
181 // See https://github.com/etcd-io/etcd/issues/11969 for more detail.
182 ticker := time.NewTicker(10 * time.Minute)
183 defer ticker.Stop()
184 for {
185 used, err := runtime.FDUsage()
186 if err != nil {
187 plog.Errorf("cannot monitor file descriptor usage (%v)", err)
188 return
189 }
190 fdUsed.Set(float64(used))
191 limit, err := runtime.FDLimit()
192 if err != nil {
193 plog.Errorf("cannot monitor file descriptor usage (%v)", err)
194 return
195 }
196 fdLimit.Set(float64(limit))
197 if used >= limit/5*4 {
198 plog.Warningf("80%% of the file descriptor limit is used [used = %d, limit = %d]", used, limit)
199 }
200 select {
201 case <-ticker.C:
202 case <-done:
203 return
204 }
205 }
206}