Hung-Wei Chiu | e3c1597 | 2021-04-28 15:52:09 -0700 | [diff] [blame] | 1 | {{/* |
| 2 | # Copyright 2021-present Open Networking Foundation |
| 3 | |
Scott Baker | 4cad5ee | 2022-06-09 14:17:35 -0700 | [diff] [blame] | 4 | # SPDX-License-Identifier: Apache-2.0 |
Hung-Wei Chiu | e3c1597 | 2021-04-28 15:52:09 -0700 | [diff] [blame] | 5 | */}} |
| 6 | |
| 7 | apiVersion: v1 |
| 8 | kind: ConfigMap |
| 9 | metadata: |
| 10 | name: "tost-telegraf-config-script" |
| 11 | labels: |
| 12 | chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" |
| 13 | release: "{{ .Release.Name }}" |
| 14 | app: tost-telegraf |
| 15 | data: |
Yi Tseng | 0e78d9d | 2021-10-21 18:15:03 -0700 | [diff] [blame] | 16 | sdfabric_telegraf.sh: | |
#!/bin/bash
# Bootstrap: download a pinned kubectl release into /usr/local/bin and
# install jq from the distribution package repository.
curl --silent --location --remote-name \
  https://dl.k8s.io/release/v1.21.0/bin/linux/amd64/kubectl
install -m 755 kubectl /usr/local/bin/kubectl
apt update
apt install -y jq

# Shared jq helper: count(s) yields the number of outputs produced by s.
# It is written to ~/.jq so that the jq filters used later in this script
# can call it without redefining it.
printf '%s\n' 'def count(s): reduce s as $_ (0;.+1);' > ~/.jq
| 28 | |
#######################################
# Select the ONOS REST endpoint to use for the next query.
# Picks one pod IP at random among the pods whose "onos-classic" container
# reports ready; falls back to the chart-configured server otherwise.
# Globals:
#   ONOS_SERVER (written)
# Arguments:
#   None
# Returns:
#   0 always; a failing kubectl/jq simply leaves the fallback in place.
#######################################
get-onos-server() {
  local picked

  # Reset on every call so a stale pod IP never survives a failed lookup.
  ONOS_SERVER="{{ .Values.onos.server }}"

  # any(...) ties the name and the readiness check to the SAME container;
  # testing them on independently iterated containers would also accept a
  # pod whose sidecar is ready while onos-classic itself is not.
  # `[]?` skips pods that do not have containerStatuses yet.
  picked=$(kubectl get -n "{{ .Values.onos.namespace }}" --output json pods \
    | jq -r '.items[]?
        | select(any(.status.containerStatuses[]?;
                     .name == "onos-classic" and .ready == true))
        | .status.podIP // empty' \
    | shuf -n 1)

  if [[ -n "$picked" ]]; then
    ONOS_SERVER=$picked
  fi
}
Hung-Wei Chiu | e3c1597 | 2021-04-28 15:52:09 -0700 | [diff] [blame] | 39 | |
#######################################
# Query one ONOS REST collection and count the entries matching a filter.
# A controller is re-selected through get-onos-server before every request.
# Arguments:
#   $1 - REST path below /onos/v1/ (e.g. "links")
#   $2 - jq program that prints the count
# Outputs:
#   The count on stdout; nothing when the request or the filter fails.
#######################################
onos-count() {
  get-onos-server
  curl --fail -sSL \
    --user "{{ .Values.onos.username }}:{{ .Values.onos.password }}" \
    --noproxy "$ONOS_SERVER:{{ .Values.onos.port }}" \
    -X GET -H 'Accept: application/json' \
    "http://$ONOS_SERVER:{{ .Values.onos.port }}/onos/v1/$1" \
    | jq "$2"
}

# One collection round is run for every line received on stdin; the content
# of the line itself is ignored.
while IFS= read -r LINE; do
  # Topology matrix
  ACTIVE_LINKS=$(onos-count links \
    'count(.links[]?.state | select(. == "ACTIVE"))')
  DEVICES=$(onos-count devices \
    'count(.devices[]? | select(.available and .type=="SWITCH"))')
  ENABLE_DEVICE_PORTS=$(onos-count devices/ports \
    'count(.ports[]?.isEnabled | select(.))')

  # A failed query leaves the variable empty; report 0 so the record stays
  # well-formed. The default must target the same variable that is printed.
  ACTIVE_LINKS=${ACTIVE_LINKS:-0}
  DEVICES=${DEVICES:-0}
  ENABLE_DEVICE_PORTS=${ENABLE_DEVICE_PORTS:-0}
  echo "onos_telegraf active_links=${ACTIVE_LINKS},enable_device_ports=${ENABLE_DEVICE_PORTS},devices=${DEVICES}"

  # Readiness for each ONOS instance and the config loader (overall readiness).
  # `[]?` makes pods without containerStatuses count as 0 ready containers
  # instead of aborting the jq program.
  kubectl get po -n "{{ .Values.onos.namespace }}" -l '{{ .Values.onos.onos_classic_label }}' -o json \
    | jq -r '.items[]? | "onos_telegraf,pod=" + (.metadata.name) + " ready=" + (count(select(.status.containerStatuses[]?.ready)) | tostring)'
  kubectl get po -n "{{ .Values.onos.namespace }}" -l '{{ .Values.onos.onos_config_loader_label }}' -o json \
    | jq -r '"onos_telegraf,pod=onos-config-loader ready=" + (count(select(.items[0].status.containerStatuses[]?.ready)) | tostring)'
done
Yi Tseng | 342f4f1 | 2021-11-16 00:45:40 -0800 | [diff] [blame] | 64 | |
| 65 | sdfabric_health_indicator.sh: | |
| 66 | #!/bin/bash |
| 67 | {{ if .Values.health_indicator.enabled }} |
# Health states reported in the "health" field.
readonly HEALTH_UNKNOWN=0 HEALTH_UP=1 HEALTH_DEGRADED=2 HEALTH_DOWN=3

# Reason codes reported in the "reason" field.
readonly REASON_PKT_LOSS=1 REASON_RTT=2
readonly REASON_ONOS_NOT_READY=3 REASON_ATOMIX_NOT_READY=4
readonly REASON_LINK_DOWN=5 REASON_DEVICE_DOWN=6

# Most recent ping statistics.
AVG_RTT=0
PACKET_LOSS_PERCENT=0
| 82 | |
#######################################
# Ping a host and print a health record when it looks unhealthy.
# Globals:
#   PACKET_LOSS_PERCENT, AVG_RTT (written)
#   HEALTH_DOWN, HEALTH_DEGRADED, REASON_PKT_LOSS, REASON_RTT (read)
# Arguments:
#   $1 - host to ping
#   $2 - optional: loss percentage at/above which the host is reported down
#        (defaults to the chart's packet_loss_threshold)
#   $3 - optional: average RTT in ms above which the host is reported
#        degraded (defaults to the chart's rtt_threshold_ms)
# Outputs:
#   One "sdfabric_telegraf ..." record on stdout when the host is unhealthy.
# Returns:
#   0 when the host is healthy, 1 otherwise.
#######################################
check_host() {
  local -r host=$1
  local loss_threshold="{{ .Values.health_indicator.packet_loss_threshold }}"
  local rtt_threshold_ms="{{ $.Values.health_indicator.rtt_threshold_ms }}"
  # The integer part is captured separately so that a fractional loss such
  # as "33.3%" is not misread as "3%".
  local -r loss_re='([0-9]+)(\.[0-9]+)?% packet loss'
  # min/avg/max/mdev summary; group 1 is the integer part of the average.
  local -r rtt_re='[0-9.]+/([0-9]+)(\.[0-9]+)?/[0-9.]+/[0-9.]+'
  local ping_result

  if [[ -n "${2:-}" ]]; then loss_threshold=$2; fi
  if [[ -n "${3:-}" ]]; then rtt_threshold_ms=$3; fi

  # A non-zero ping exit status is expected on loss; the output is parsed.
  ping_result=$(ping -i 0.1 -W 1 -c 10 "$host")

  if [[ $ping_result =~ $loss_re ]]; then
    PACKET_LOSS_PERCENT=${BASH_REMATCH[1]}
  else
    # No statistics at all (e.g. the name did not resolve): treat the host
    # as unreachable rather than letting an empty value compare as 0.
    PACKET_LOSS_PERCENT=100
  fi

  if [[ $ping_result =~ $rtt_re ]]; then
    AVG_RTT=${BASH_REMATCH[1]} # bash can only compare integers
  else
    AVG_RTT=0
  fi

  if [[ "$PACKET_LOSS_PERCENT" -ge "$loss_threshold" ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DOWN,reason=$REASON_PKT_LOSS"
    return 1
  elif [[ "$AVG_RTT" -gt "$rtt_threshold_ms" ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_RTT,rtt=$AVG_RTT,expected_rtt=$rtt_threshold_ms"
    return 1
  elif [[ "$PACKET_LOSS_PERCENT" -gt 0 ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_PKT_LOSS,percent=$PACKET_LOSS_PERCENT"
    return 1
  fi
  return 0
}
| 100 | |
| 101 | # Wait until jq and kubectl are installed |
| 102 | while ! (which jq && which kubectl) > /dev/null ; do |
| 103 | sleep 1 |
| 104 | done |
| 105 | |
| 106 | MAX_NUM_LINKS=0 |
| 107 | |
| 108 | while IFS= read -r LINE; do |
| 109 | {{ range .Values.health_indicator.expected_hosts }} |
| 110 | check_host {{ . }} || continue |
| 111 | {{ end }} |
| 112 | |
| 113 | NUM_ATOMIX_NOT_READY=$(kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.atomix_label }}' -o json | \ |
| 114 | jq -r 'count(select(.items[].status.containerStatuses[].ready == false))') |
| 115 | if [[ $NUM_ATOMIX_NOT_READY -gt 0 ]]; then |
| 116 | echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_ATOMIX_NOT_READY,num_atomix_not_ready=$NUM_ATOMIX_NOT_READY" |
| 117 | continue |
| 118 | fi |
| 119 | |
| 120 | NUM_ONOS_NOT_READY=$(kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.onos_classic_label }}' -o json | \ |
| 121 | jq -r 'count(select(.items[].status.containerStatuses[].ready == false))') |
| 122 | if [[ $NUM_ONOS_NOT_READY -gt 0 ]]; then |
| 123 | echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_ONOS_NOT_READY,num_onos_not_ready=$NUM_ONOS_NOT_READY" |
| 124 | continue |
| 125 | fi |
| 126 | |
| 127 | ACTIVE_LINKS=$(curl --fail -sSL --user {{ .Values.onos.username }}:{{ .Values.onos.password }} --noproxy {{ .Values.onos.server }}:{{ .Values.onos.port }} -X GET -H 'Accept: application/json' \ |
| 128 | http://{{ .Values.onos.server }}:{{ .Values.onos.port }}/onos/v1/links | \ |
| 129 | jq 'count(.links[]?.state | select(. == "ACTIVE"))') |
| 130 | if [[ -z $ACTIVE_LINKS ]] || [[ ! $ACTIVE_LINKS -eq {{ .Values.health_indicator.expected_num_links }} ]]; then |
| 131 | echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_LINK_DOWN,active_links=$ACTIVE_LINKS,expected_links={{ .Values.expected_num_links }}" |
| 132 | continue |
| 133 | fi |
| 134 | |
| 135 | UNAVAILABLE_DEVICES=$(curl --fail -sSL --user {{ .Values.onos.username }}:{{ .Values.onos.password }} --noproxy {{ .Values.onos.server }}:{{ .Values.onos.port }} -X GET -H 'Accept: application/json' \ |
| 136 | http://{{ .Values.onos.server }}:{{ .Values.onos.port }}/onos/v1/devices | \ |
| 137 | jq 'count(.devices[]? | select(.type=="SWITCH" and (.available == false)))') |
| 138 | if [[ $UNAVAILABLE_DEVICES -gt 0 ]]; then |
| 139 | echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_DEVICE_DOWN,num_device_down=$UNAVAILABLE_DEVICES" |
| 140 | continue |
| 141 | fi |
| 142 | |
| 143 | echo "sdfabric_telegraf health=$HEALTH_UP" |
| 144 | done |
| 145 | {{ else }} |
# Health indicator is off
# The HEALTH_* constants are rendered only in the enabled branch of this
# template, so the "unknown" code is defined here; without it the record
# below would be emitted as "health=" with no value.
readonly HEALTH_UNKNOWN=0
while IFS= read -r LINE; do
  echo "sdfabric_telegraf health=$HEALTH_UNKNOWN"
done
| 150 | {{ end }} |