blob: d7101351e964ccf6e002dd2c9c9c8109a0a23032 [file] [log] [blame]
{{/*
# Copyright 2021-present Open Networking Foundation
# SPDX-License-Identifier: Apache-2.0
*/}}
# ConfigMap carrying the helper shell scripts that are listed under `data`.
apiVersion: v1
kind: ConfigMap
metadata:
name: "tost-telegraf-config-script"
labels:
# Chart name and version taken from the chart that renders this template.
chart: "{{ .Chart.Name }}-{{ .Chart.Version }}"
# Name of the Helm release that renders this template.
release: "{{ .Release.Name }}"
app: tost-telegraf
# Each key below is a script name; its value is the script text.
data:
sdfabric_telegraf.sh: |
#!/bin/bash
# Bootstrap: install the tools used by the rest of this script (kubectl, jq)
# and register a small jq helper.
#
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page under the name "kubectl"; the binary is only installed when the
# download succeeded.
curl --fail -sLO https://dl.k8s.io/release/v1.21.0/bin/linux/amd64/kubectl &&
  install -m 755 kubectl /usr/local/bin/kubectl
# apt-get is the script-stable front end (plain `apt` targets interactive use).
# Its output goes to stderr so that stdout only carries the metric lines echoed
# by the loop below.
# NOTE(review): presumably stdout is parsed as metrics by the consumer - confirm.
apt-get update >&2
apt-get install -y jq >&2
# jq helper: count(s) yields how many outputs the stream s produces.
# The quoted delimiter stops the shell from expanding $_ inside the heredoc.
cat <<'EOF' > ~/.jq
def count(s): reduce s as $_ (0;.+1);
EOF
#######################################
# Choose the ONOS endpoint for the next REST call.
# Globals:
#   ONOS_SERVER (written) - pod IP of one randomly chosen pod whose
#     "onos-classic" container is ready; falls back to the configured
#     .Values.onos.server when no such pod is found.
# Arguments: none
#######################################
get-onos-server() {
  local ready_ips
  ONOS_SERVER="{{ .Values.onos.server }}"
  # any(...) checks name and readiness on the SAME container entry, so a pod is
  # not selected because some other container in it happens to be ready, and
  # each pod is emitted at most once. Pods without an IP are skipped.
  ready_ips=$(kubectl get -n "{{ .Values.onos.namespace }}" --output json pods | \
    jq -r '.items[]? | select(any(.status.containerStatuses[]?; .ready == true and .name == "onos-classic")) | .status.podIP // empty')
  if [[ -n "$ready_ips" ]]; then
    # One IP per line; pick a random one to spread the queries.
    ONOS_SERVER=$(shuf -n 1 <<<"$ready_ips")
  fi
}
#######################################
# Query one ONOS REST collection and print how many entries match.
# Calls get-onos-server first, so every query may go to a different instance.
# Arguments:
#   $1 - path below /onos/v1/ (e.g. "links")
#   $2 - jq stream expression handed to count()
# Outputs:
#   the count on stdout; nothing when the request fails
#######################################
onos-count() {
  get-onos-server
  curl --fail -sSL --user "{{ .Values.onos.username }}:{{ .Values.onos.password }}" \
    --noproxy "$ONOS_SERVER:{{ .Values.onos.port }}" -X GET -H 'Accept: application/json' \
    "http://$ONOS_SERVER:{{ .Values.onos.port }}/onos/v1/$1" | \
    jq "count($2)"
}

# Every line read from stdin triggers one round of measurements; the content
# of the line itself is ignored.
while IFS= read -r LINE; do
  # Topology metrics
  ACTIVE_LINKS=$(onos-count links '.links[]?.state | select(. == "ACTIVE")')
  DEVICES=$(onos-count devices '.devices[]? | select(.available and .type=="SWITCH")')
  ENABLE_DEVICE_PORTS=$(onos-count devices/ports '.ports[]?.isEnabled | select(.)')
  # A failed query leaves the variable empty; report 0 so the metric line
  # never contains an empty field.
  ACTIVE_LINKS=${ACTIVE_LINKS:-0}
  DEVICES=${DEVICES:-0}
  ENABLE_DEVICE_PORTS=${ENABLE_DEVICE_PORTS:-0}
  echo "onos_telegraf active_links=${ACTIVE_LINKS},enable_device_ports=${ENABLE_DEVICE_PORTS},devices=${DEVICES}"
  # Readiness for each ONOS instance and the config loader(overall readiness):
  # one line per ONOS pod with its number of ready containers ...
  kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.onos_classic_label }}' -o json | \
    jq -r '.items[]? | "onos_telegraf,pod=" + (.metadata.name) + " ready=" + (count(select(.status.containerStatuses[].ready)) | tostring)'
  # ... and one line for the first pod matching the config-loader label.
  kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.onos_config_loader_label }}' -o json | \
    jq -r '"onos_telegraf,pod=onos-config-loader ready=" + (count(select(.items[0].status.containerStatuses[].ready)) | tostring)'
done
sdfabric_health_indicator.sh: |
#!/bin/bash
{{ if .Values.health_indicator.enabled }}
# Values reported in the "health" field.
readonly \
  HEALTH_UNKNOWN=0 \
  HEALTH_UP=1 \
  HEALTH_DEGRADED=2 \
  HEALTH_DOWN=3
# Values reported in the "reason" field.
readonly \
  REASON_PKT_LOSS=1 \
  REASON_RTT=2 \
  REASON_ONOS_NOT_READY=3 \
  REASON_ATOMIX_NOT_READY=4 \
  REASON_LINK_DOWN=5 \
  REASON_DEVICE_DOWN=6
# Results of the most recent check_host run (overwritten on every call).
PACKET_LOSS_PERCENT=0
AVG_RTT=0
#######################################
# Ping one host and report when it looks unhealthy.
# Globals:
#   PACKET_LOSS_PERCENT, AVG_RTT (written) - integer results of this probe
#   HEALTH_DOWN, HEALTH_DEGRADED, REASON_PKT_LOSS, REASON_RTT (read)
# Arguments:
#   $1 - host name or address to ping
# Outputs:
#   one "sdfabric_telegraf ..." line on stdout when a problem is detected
# Returns:
#   0 when the host passed every check, 1 otherwise
#######################################
check_host() {
  local ping_output
  local -r loss_threshold="{{ .Values.health_indicator.packet_loss_threshold }}"
  local -r rtt_threshold_ms="{{ $.Values.health_indicator.rtt_threshold_ms }}"
  # Integer part of the loss percentage, e.g. "10% packet loss".
  local -r loss_re='([0-9]+)(\.[0-9]+)?% packet loss'
  # The min/avg/max/mdev quadruple of the summary line; group 2 is the average.
  local -r rtt_re='([0-9]+\.[0-9]+)/([0-9]+\.[0-9]+)/([0-9]+\.[0-9]+)/([0-9]+\.[0-9]+)'

  ping_output=$(ping -i 0.1 -W 1 -c 10 "$1")

  if [[ $ping_output =~ $loss_re ]]; then
    PACKET_LOSS_PERCENT=${BASH_REMATCH[1]}
  else
    # No summary at all (ping printed nothing usable): treat the host as
    # unreachable instead of letting an empty value compare as 0% loss.
    PACKET_LOSS_PERCENT=100
  fi

  if [[ $ping_output =~ $rtt_re ]]; then
    AVG_RTT=${BASH_REMATCH[2]%.*} # default bash can only compare integer
  else
    AVG_RTT=0
  fi

  if [[ $PACKET_LOSS_PERCENT -ge "$loss_threshold" ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DOWN,reason=$REASON_PKT_LOSS"
    return 1
  elif [[ $AVG_RTT -gt "$rtt_threshold_ms" ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_RTT,rtt=$AVG_RTT,expected_rtt=$rtt_threshold_ms"
    return 1
  elif [[ $PACKET_LOSS_PERCENT -gt 0 ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_PKT_LOSS,percent=$PACKET_LOSS_PERCENT"
    return 1
  fi
  return 0
}
# Wait until jq and kubectl are installed (the sdfabric_telegraf.sh script in
# this ConfigMap installs both). `command -v` is a shell builtin, so the check
# does not depend on the external `which` program being present.
until command -v jq > /dev/null && command -v kubectl > /dev/null; do
  sleep 1
done
# NOTE(review): MAX_NUM_LINKS is not referenced in the visible part of this file.
MAX_NUM_LINKS=0
# Every line read from stdin triggers one health evaluation. The checks run in
# order and the first failing one prints its status line and ends the round;
# "health=$HEALTH_UP" is only printed when all of them pass.
# The jq filters use count(), which sdfabric_telegraf.sh writes to ~/.jq.
# NOTE(review): assumes both scripts run with the same home directory - confirm.
while IFS= read -r LINE; do
  # 1. Reachability of every expected host (check_host prints the status line).
  {{ range .Values.health_indicator.expected_hosts }}
  check_host "{{ . }}" || continue
  {{ end }}
  # 2. Atomix: number of containers that are not ready.
  NUM_ATOMIX_NOT_READY=$(kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.atomix_label }}' -o json | \
    jq -r 'count(select(.items[].status.containerStatuses[].ready == false))')
  if [[ $NUM_ATOMIX_NOT_READY -gt 0 ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_ATOMIX_NOT_READY,num_atomix_not_ready=$NUM_ATOMIX_NOT_READY"
    continue
  fi
  # 3. ONOS: number of containers that are not ready.
  NUM_ONOS_NOT_READY=$(kubectl get po -n {{ .Values.onos.namespace }} -l '{{ .Values.onos.onos_classic_label }}' -o json | \
    jq -r 'count(select(.items[].status.containerStatuses[].ready == false))')
  if [[ $NUM_ONOS_NOT_READY -gt 0 ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_ONOS_NOT_READY,num_onos_not_ready=$NUM_ONOS_NOT_READY"
    continue
  fi
  # 4. Links: the number of ACTIVE links must equal the expected number.
  EXPECTED_NUM_LINKS="{{ .Values.health_indicator.expected_num_links }}"
  ACTIVE_LINKS=$(curl --fail -sSL --user {{ .Values.onos.username }}:{{ .Values.onos.password }} --noproxy {{ .Values.onos.server }}:{{ .Values.onos.port }} -X GET -H 'Accept: application/json' \
    http://{{ .Values.onos.server }}:{{ .Values.onos.port }}/onos/v1/links | \
    jq 'count(.links[]?.state | select(. == "ACTIVE"))')
  if [[ -z $ACTIVE_LINKS ]] || [[ $ACTIVE_LINKS -ne $EXPECTED_NUM_LINKS ]]; then
    # A failed query leaves ACTIVE_LINKS empty; report 0 rather than an empty
    # field. The expected value is the same one used in the comparison above.
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_LINK_DOWN,active_links=${ACTIVE_LINKS:-0},expected_links=$EXPECTED_NUM_LINKS"
    continue
  fi
  # 5. Devices: no device of type SWITCH may be unavailable.
  UNAVAILABLE_DEVICES=$(curl --fail -sSL --user {{ .Values.onos.username }}:{{ .Values.onos.password }} --noproxy {{ .Values.onos.server }}:{{ .Values.onos.port }} -X GET -H 'Accept: application/json' \
    http://{{ .Values.onos.server }}:{{ .Values.onos.port }}/onos/v1/devices | \
    jq 'count(.devices[]? | select(.type=="SWITCH" and (.available == false)))')
  if [[ $UNAVAILABLE_DEVICES -gt 0 ]]; then
    echo "sdfabric_telegraf health=$HEALTH_DEGRADED,reason=$REASON_DEVICE_DOWN,num_device_down=$UNAVAILABLE_DEVICES"
    continue
  fi
  echo "sdfabric_telegraf health=$HEALTH_UP"
done
{{ else }}
# Health indicator is off: answer every line read from stdin with "unknown".
# The HEALTH_* constants are declared in the enabled branch of this template
# and are therefore not rendered here, so the value is defined locally.
readonly HEALTH_UNKNOWN=0
while IFS= read -r LINE; do
  echo "sdfabric_telegraf health=$HEALTH_UNKNOWN"
done
{{ end }}