// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| 14 | |
| 15 | package rafthttp |
| 16 | |
| 17 | import ( |
| 18 | "time" |
| 19 | |
| 20 | "github.com/prometheus/client_golang/prometheus" |
| 21 | "github.com/xiang90/probing" |
| 22 | "go.uber.org/zap" |
| 23 | ) |
| 24 | |
const (
	// RoundTripperNameRaftMessage names the round-tripper that carries every
	// Raft message except "snap.Message".
	RoundTripperNameRaftMessage = "ROUND_TRIPPER_RAFT_MESSAGE"

	// RoundTripperNameSnapshot names the round-tripper that carries merged
	// snapshot messages.
	RoundTripperNameSnapshot = "ROUND_TRIPPER_SNAPSHOT"
)
| 32 | |
| 33 | var ( |
| 34 | // proberInterval must be shorter than read timeout. |
| 35 | // Or the connection will time-out. |
| 36 | proberInterval = ConnReadTimeout - time.Second |
| 37 | statusMonitoringInterval = 30 * time.Second |
| 38 | statusErrorInterval = 5 * time.Second |
| 39 | ) |
| 40 | |
| 41 | func addPeerToProber(lg *zap.Logger, p probing.Prober, id string, us []string, roundTripperName string, rttSecProm *prometheus.HistogramVec) { |
| 42 | hus := make([]string, len(us)) |
| 43 | for i := range us { |
| 44 | hus[i] = us[i] + ProbingPrefix |
| 45 | } |
| 46 | |
| 47 | p.AddHTTP(id, proberInterval, hus) |
| 48 | |
| 49 | s, err := p.Status(id) |
| 50 | if err != nil { |
| 51 | if lg != nil { |
| 52 | lg.Warn("failed to add peer into prober", zap.String("remote-peer-id", id)) |
| 53 | } else { |
| 54 | plog.Errorf("failed to add peer %s into prober", id) |
| 55 | } |
| 56 | return |
| 57 | } |
| 58 | |
| 59 | go monitorProbingStatus(lg, s, id, roundTripperName, rttSecProm) |
| 60 | } |
| 61 | |
| 62 | func monitorProbingStatus(lg *zap.Logger, s probing.Status, id string, roundTripperName string, rttSecProm *prometheus.HistogramVec) { |
| 63 | // set the first interval short to log error early. |
| 64 | interval := statusErrorInterval |
| 65 | for { |
| 66 | select { |
| 67 | case <-time.After(interval): |
| 68 | if !s.Health() { |
| 69 | if lg != nil { |
| 70 | lg.Warn( |
| 71 | "prober detected unhealthy status", |
| 72 | zap.String("round-tripper-name", roundTripperName), |
| 73 | zap.String("remote-peer-id", id), |
| 74 | zap.Duration("rtt", s.SRTT()), |
| 75 | zap.Error(s.Err()), |
| 76 | ) |
| 77 | } else { |
| 78 | plog.Warningf("health check for peer %s could not connect: %v", id, s.Err()) |
| 79 | } |
| 80 | interval = statusErrorInterval |
| 81 | } else { |
| 82 | interval = statusMonitoringInterval |
| 83 | } |
| 84 | if s.ClockDiff() > time.Second { |
| 85 | if lg != nil { |
| 86 | lg.Warn( |
| 87 | "prober found high clock drift", |
| 88 | zap.String("round-tripper-name", roundTripperName), |
| 89 | zap.String("remote-peer-id", id), |
| 90 | zap.Duration("clock-drift", s.ClockDiff()), |
| 91 | zap.Duration("rtt", s.SRTT()), |
| 92 | zap.Error(s.Err()), |
| 93 | ) |
| 94 | } else { |
| 95 | plog.Warningf("the clock difference against peer %s is too high [%v > %v]", id, s.ClockDiff(), time.Second) |
| 96 | } |
| 97 | } |
| 98 | rttSecProm.WithLabelValues(id).Observe(s.SRTT().Seconds()) |
| 99 | |
| 100 | case <-s.StopNotify(): |
| 101 | return |
| 102 | } |
| 103 | } |
| 104 | } |