blob: e0c0cde85538cd5e76197257b512b22e7a1870c5 [file] [log] [blame]
khenaidoo59ce9dd2019-11-11 13:05:32 -05001// Copyright 2015 The etcd Authors
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package etcdserver
16
17import (
18 goruntime "runtime"
19 "time"
20
21 "go.etcd.io/etcd/pkg/runtime"
22 "go.etcd.io/etcd/version"
23
24 "github.com/prometheus/client_golang/prometheus"
25 "go.uber.org/zap"
26)
27
28var (
29 hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
30 Namespace: "etcd",
31 Subsystem: "server",
32 Name: "has_leader",
33 Help: "Whether or not a leader exists. 1 is existence, 0 is not.",
34 })
35 isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
36 Namespace: "etcd",
37 Subsystem: "server",
38 Name: "is_leader",
39 Help: "Whether or not this member is a leader. 1 if is, 0 otherwise.",
40 })
41 leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
42 Namespace: "etcd",
43 Subsystem: "server",
44 Name: "leader_changes_seen_total",
45 Help: "The number of leader changes seen.",
46 })
47 isLearner = prometheus.NewGauge(prometheus.GaugeOpts{
48 Namespace: "etcd",
49 Subsystem: "server",
50 Name: "is_learner",
51 Help: "Whether or not this member is a learner. 1 if is, 0 otherwise.",
52 })
53 learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{
54 Namespace: "etcd",
55 Subsystem: "server",
56 Name: "learner_promote_failures",
57 Help: "The total number of failed learner promotions (likely learner not ready) while this member is leader.",
58 },
59 []string{"Reason"},
60 )
61 learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{
62 Namespace: "etcd",
63 Subsystem: "server",
64 Name: "learner_promote_successes",
65 Help: "The total number of successful learner promotions while this member is leader.",
66 })
67 heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
68 Namespace: "etcd",
69 Subsystem: "server",
70 Name: "heartbeat_send_failures_total",
71 Help: "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
72 })
73 slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
74 Namespace: "etcd",
75 Subsystem: "server",
76 Name: "slow_apply_total",
77 Help: "The total number of slow apply requests (likely overloaded from slow disk).",
78 })
79 applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
80 Namespace: "etcd",
81 Subsystem: "server",
82 Name: "snapshot_apply_in_progress_total",
83 Help: "1 if the server is applying the incoming snapshot. 0 if none.",
84 })
85 proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
86 Namespace: "etcd",
87 Subsystem: "server",
88 Name: "proposals_committed_total",
89 Help: "The total number of consensus proposals committed.",
90 })
91 proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
92 Namespace: "etcd",
93 Subsystem: "server",
94 Name: "proposals_applied_total",
95 Help: "The total number of consensus proposals applied.",
96 })
97 proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
98 Namespace: "etcd",
99 Subsystem: "server",
100 Name: "proposals_pending",
101 Help: "The current number of pending proposals to commit.",
102 })
103 proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
104 Namespace: "etcd",
105 Subsystem: "server",
106 Name: "proposals_failed_total",
107 Help: "The total number of failed proposals seen.",
108 })
109 slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
110 Namespace: "etcd",
111 Subsystem: "server",
112 Name: "slow_read_indexes_total",
113 Help: "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
114 })
115 readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
116 Namespace: "etcd",
117 Subsystem: "server",
118 Name: "read_indexes_failed_total",
119 Help: "The total number of failed read indexes seen.",
120 })
121 leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
122 Namespace: "etcd_debugging",
123 Subsystem: "server",
124 Name: "lease_expired_total",
125 Help: "The total number of expired leases.",
126 })
127 quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
128 Namespace: "etcd",
129 Subsystem: "server",
130 Name: "quota_backend_bytes",
131 Help: "Current backend storage quota size in bytes.",
132 })
133 currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
134 Namespace: "etcd",
135 Subsystem: "server",
136 Name: "version",
137 Help: "Which version is running. 1 for 'server_version' label with current version.",
138 },
139 []string{"server_version"})
140 currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
141 Namespace: "etcd",
142 Subsystem: "server",
143 Name: "go_version",
144 Help: "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
145 },
146 []string{"server_go_version"})
147 serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
148 Namespace: "etcd",
149 Subsystem: "server",
150 Name: "id",
151 Help: "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
152 },
153 []string{"server_id"})
154)
155
156func init() {
157 prometheus.MustRegister(hasLeader)
158 prometheus.MustRegister(isLeader)
159 prometheus.MustRegister(leaderChanges)
160 prometheus.MustRegister(heartbeatSendFailures)
161 prometheus.MustRegister(slowApplies)
162 prometheus.MustRegister(applySnapshotInProgress)
163 prometheus.MustRegister(proposalsCommitted)
164 prometheus.MustRegister(proposalsApplied)
165 prometheus.MustRegister(proposalsPending)
166 prometheus.MustRegister(proposalsFailed)
167 prometheus.MustRegister(slowReadIndex)
168 prometheus.MustRegister(readIndexFailed)
169 prometheus.MustRegister(leaseExpired)
170 prometheus.MustRegister(quotaBackendBytes)
171 prometheus.MustRegister(currentVersion)
172 prometheus.MustRegister(currentGoVersion)
173 prometheus.MustRegister(serverID)
174 prometheus.MustRegister(isLearner)
175 prometheus.MustRegister(learnerPromoteSucceed)
176 prometheus.MustRegister(learnerPromoteFailed)
177
178 currentVersion.With(prometheus.Labels{
179 "server_version": version.Version,
180 }).Set(1)
181 currentGoVersion.With(prometheus.Labels{
182 "server_go_version": goruntime.Version(),
183 }).Set(1)
184}
185
186func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) {
187 ticker := time.NewTicker(5 * time.Second)
188 defer ticker.Stop()
189 for {
190 used, err := runtime.FDUsage()
191 if err != nil {
192 if lg != nil {
193 lg.Warn("failed to get file descriptor usage", zap.Error(err))
194 } else {
195 plog.Errorf("cannot monitor file descriptor usage (%v)", err)
196 }
197 return
198 }
199 limit, err := runtime.FDLimit()
200 if err != nil {
201 if lg != nil {
202 lg.Warn("failed to get file descriptor limit", zap.Error(err))
203 } else {
204 plog.Errorf("cannot monitor file descriptor usage (%v)", err)
205 }
206 return
207 }
208 if used >= limit/5*4 {
209 if lg != nil {
210 lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit))
211 } else {
212 plog.Warningf("80%% of the file descriptor limit is used [used = %d, limit = %d]", used, limit)
213 }
214 }
215 select {
216 case <-ticker.C:
217 case <-done:
218 return
219 }
220 }
221}