blob: d7101351e964ccf6e002dd2c9c9c8109a0a23032 [file] [log] [blame]
Hung-Wei Chiue3c15972021-04-28 15:52:09 -07001{{/*
2# Copyright 2021-present Open Networking Foundation
3
Scott Baker4cad5ee2022-06-09 14:17:35 -07004# SPDX-License-Identifier: Apache-2.0
Hung-Wei Chiue3c15972021-04-28 15:52:09 -07005*/}}
6
7apiVersion: v1
8kind: ConfigMap
9metadata:
10 name: "tost-telegraf-config-script"
11 labels:
12 chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
13 release: "{{ .Release.Name }}"
14 app: tost-telegraf
15data:
Yi Tseng0e78d9d2021-10-21 18:15:03 -070016 sdfabric_telegraf.sh: |
Hung-Wei Chiu9e5acb12021-05-19 11:32:27 -070017 #!/bin/bash
# Install kubectl and jq.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as "kubectl"; the binary is only installed when the download succeeded.
curl --fail -sLO https://dl.k8s.io/release/v1.21.0/bin/linux/amd64/kubectl \
  && install -m 755 kubectl /usr/local/bin/kubectl
# apt-get instead of apt: apt's command line is intended for interactive use
# and warns when it is driven from a script.
apt-get update
apt-get install -y jq
23
# Utility for jq: count(s) yields how many outputs the stream s produces.
# The definition is written to ~/.jq so the jq filters used further down can
# call count(...). Single quotes keep "$_" literal without any escaping.
printf '%s\n' 'def count(s): reduce s as $_ (0;.+1);' > ~/.jq
28
#######################################
# Choose the ONOS endpoint to query and store it in ONOS_SERVER.
#
# ONOS_SERVER starts as the configured server value. When kubectl reports at
# least one pod whose "onos-classic" container is ready, ONOS_SERVER is
# replaced by the IP of one of those pods, picked at random.
#
# Globals:   ONOS_SERVER (written)
# Arguments: none
# Returns:   0
#######################################
get-onos-server() {
  local -a controllers=()
  ONOS_SERVER={{ .Values.onos.server }}
  # any(...) tests name and readiness on the SAME container; two separate
  # containerStatuses[] iterations would form a cross product and could match
  # (and repeat) pods whose onos-classic container is not the ready one.
  # "[]?" tolerates pods without containerStatuses, and "// empty" drops pods
  # that have no IP yet instead of printing the string "null".
  mapfile -t controllers < <(
    kubectl get pods -n {{ .Values.onos.namespace }} --output json \
      | jq -r '.items[]
          | select(any(.status.containerStatuses[]?; .name == "onos-classic" and .ready == true))
          | .status.podIP // empty'
  )
  if (( ${#controllers[@]} )); then
    ONOS_SERVER=$(shuf -n 1 -e "${controllers[@]}")
  fi
}
Hung-Wei Chiue3c15972021-04-28 15:52:09 -070039
#######################################
# GET an ONOS REST resource and print the response body.
# A controller is chosen with get-onos-server before every request.
# Globals:   ONOS_SERVER (written by get-onos-server, then read)
# Arguments: $1 - resource path below /onos/v1/ (for example "links")
# Outputs:   response body on stdout; curl errors on stderr
# Returns:   exit status of curl
#######################################
onos-rest-get() {
  get-onos-server
  curl --fail -sSL --user {{ .Values.onos.username }}:{{ .Values.onos.password }} \
    --noproxy "$ONOS_SERVER:{{ .Values.onos.port }}" -X GET -H 'Accept: application/json' \
    "http://$ONOS_SERVER:{{ .Values.onos.port }}/onos/v1/$1"
}

# Every line received on stdin triggers one round of metrics on stdout.
# The content of the line itself is not used.
while IFS= read -r LINE; do
  # Topology metrics
  ACTIVE_LINKS=$(onos-rest-get links \
    | jq 'count(.links[]?.state | select(. == "ACTIVE"))')
  DEVICES=$(onos-rest-get devices \
    | jq 'count(.devices[]? | select(.available and .type=="SWITCH"))')
  ENABLE_DEVICE_PORTS=$(onos-rest-get devices/ports \
    | jq 'count(.ports[]?.isEnabled | select(.))')
  # A failed request leaves the variable empty; fall back to 0 so that the
  # emitted line never carries an empty field value.
  ACTIVE_LINKS=${ACTIVE_LINKS:-0}
  DEVICES=${DEVICES:-0}
  ENABLE_DEVICE_PORTS=${ENABLE_DEVICE_PORTS:-0}
  echo "onos_telegraf active_links=${ACTIVE_LINKS},enable_device_ports=${ENABLE_DEVICE_PORTS},devices=${DEVICES}"
  # Readiness for each ONOS instance and the config loader (overall readiness)
  kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.onos_classic_label }}' -o json \
    | jq -r '.items[]? | "onos_telegraf,pod=" + (.metadata.name) + " ready=" + (count(select(.status.containerStatuses[].ready)) | tostring)'
  kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.onos_config_loader_label }}' -o json \
    | jq -r '"onos_telegraf,pod=onos-config-loader ready=" + (count(select(.items[0].status.containerStatuses[].ready)) | tostring)'
done
Yi Tseng342f4f12021-11-16 00:45:40 -080064
65 sdfabric_health_indicator.sh: |
66 #!/bin/bash
67 {{ if .Values.health_indicator.enabled }}
# Constants: values printed in the "health" field.
declare -r HEALTH_UNKNOWN=0 HEALTH_UP=1 HEALTH_DEGRADED=2 HEALTH_DOWN=3
# Constants: values printed in the "reason" field.
declare -r REASON_PKT_LOSS=1 REASON_RTT=2
declare -r REASON_ONOS_NOT_READY=3 REASON_ATOMIX_NOT_READY=4
declare -r REASON_LINK_DOWN=5 REASON_DEVICE_DOWN=6

# Mutable state, overwritten by check_host on every call.
AVG_RTT=0
PACKET_LOSS_PERCENT=0
82
#######################################
# Ping a host ten times and report when it looks unhealthy.
#
# Globals:   PACKET_LOSS_PERCENT, AVG_RTT (written)
#            HEALTH_DOWN, HEALTH_DEGRADED, REASON_PKT_LOSS, REASON_RTT (read)
# Arguments: $1 - host name or address to ping
# Outputs:   one sdfabric_telegraf line on stdout when the host is down or
#            degraded; nothing when it is healthy
# Returns:   0 when the host is healthy, 1 when a line was printed
#######################################
check_host() {
  local ping_result
  local -r loss_threshold={{ $.Values.health_indicator.packet_loss_threshold }}
  local -r rtt_threshold_ms={{ $.Values.health_indicator.rtt_threshold_ms }}

  ping_result=$(ping -i 0.1 -W 1 -c 10 "$1")
  PACKET_LOSS_PERCENT=$(grep -P -o '\d+% packet loss' <<< "$ping_result" | awk -F '%' '{print $1}')
  # The summary line has the form min/avg/max/mdev; the second field is the average.
  AVG_RTT=$(grep -P -o '(\d+\.\d+)/(\d+\.\d+)/(\d+\.\d+)/(\d+\.\d+)' <<< "$ping_result" | awk -F'/' '{print $2}')
  AVG_RTT=${AVG_RTT%.*} # default bash can only compare integer

  # ping printed no statistics at all (for example the name did not resolve):
  # treat that as total loss. An empty value would compare as 0 and the host
  # would be reported healthy.
  PACKET_LOSS_PERCENT=${PACKET_LOSS_PERCENT:-100}
  # No RTT summary is printed when every packet was lost.
  AVG_RTT=${AVG_RTT:-0}

  if [[ $PACKET_LOSS_PERCENT -ge $loss_threshold ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DOWN,reason=$REASON_PKT_LOSS"
    return 1
  elif [[ $AVG_RTT -gt $rtt_threshold_ms ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_RTT,rtt=$AVG_RTT,expected_rtt=$rtt_threshold_ms"
    return 1
  elif [[ $PACKET_LOSS_PERCENT -gt 0 ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_PKT_LOSS,percent=$PACKET_LOSS_PERCENT"
    return 1
  fi
  return 0
}
100
# Wait until jq and kubectl are installed.
# "command -v" is a shell builtin, so the check does not depend on the
# external "which" program being present in the image.
until command -v jq > /dev/null && command -v kubectl > /dev/null; do
  sleep 1
done

# NOTE(review): MAX_NUM_LINKS is not referenced anywhere else in the visible
# script; kept in case something outside this view reads it.
MAX_NUM_LINKS=0
107
108 while IFS= read -r LINE; do
109 {{ range .Values.health_indicator.expected_hosts }}
110 check_host {{ . }} || continue
111 {{ end }}
112
113 NUM_ATOMIX_NOT_READY=$(kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.atomix_label }}' -o json | \
114 jq -r 'count(select(.items[].status.containerStatuses[].ready == false))')
115 if [[ $NUM_ATOMIX_NOT_READY -gt 0 ]]; then
116 echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_ATOMIX_NOT_READY,num_atomix_not_ready=$NUM_ATOMIX_NOT_READY"
117 continue
118 fi
119
120 NUM_ONOS_NOT_READY=$(kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.onos_classic_label }}' -o json | \
121 jq -r 'count(select(.items[].status.containerStatuses[].ready == false))')
122 if [[ $NUM_ONOS_NOT_READY -gt 0 ]]; then
123 echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_ONOS_NOT_READY,num_onos_not_ready=$NUM_ONOS_NOT_READY"
124 continue
125 fi
126
127 ACTIVE_LINKS=$(curl --fail -sSL --user {{ .Values.onos.username }}:{{ .Values.onos.password }} --noproxy {{ .Values.onos.server }}:{{ .Values.onos.port }} -X GET -H 'Accept: application/json' \
128 http://{{ .Values.onos.server }}:{{ .Values.onos.port }}/onos/v1/links | \
129 jq 'count(.links[]?.state | select(. == "ACTIVE"))')
130 if [[ -z $ACTIVE_LINKS ]] || [[ ! $ACTIVE_LINKS -eq {{ .Values.health_indicator.expected_num_links }} ]]; then
131 echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_LINK_DOWN,active_links=$ACTIVE_LINKS,expected_links={{ .Values.expected_num_links }}"
132 continue
133 fi
134
135 UNAVAILABLE_DEVICES=$(curl --fail -sSL --user {{ .Values.onos.username }}:{{ .Values.onos.password }} --noproxy {{ .Values.onos.server }}:{{ .Values.onos.port }} -X GET -H 'Accept: application/json' \
136 http://{{ .Values.onos.server }}:{{ .Values.onos.port }}/onos/v1/devices | \
137 jq 'count(.devices[]? | select(.type=="SWITCH" and (.available == false)))')
138 if [[ $UNAVAILABLE_DEVICES -gt 0 ]]; then
139 echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_DEVICE_DOWN,num_device_down=$UNAVAILABLE_DEVICES"
140 continue
141 fi
142
143 echo "sdfabric_telegraf health=$HEALTH_UP"
144 done
145 {{ else }}
# Health indicator is off: answer every line received on stdin with
# "health unknown".
# HEALTH_UNKNOWN has to be defined in this branch: the constants of the enabled
# branch are not rendered when the health indicator is disabled, and without a
# definition the line below would be printed with an empty field value.
readonly HEALTH_UNKNOWN=0
while IFS= read -r LINE; do
  echo "sdfabric_telegraf health=$HEALTH_UNKNOWN"
done
150 {{ end }}