#!/usr/bin/env python
# Copyright 2020-present Open Networking Foundation
#
# SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0
import sys
import os
import json
import enum
import time
import datetime
import requests
from pytz import timezone
from datetime import date
from collections import namedtuple
from rancher import Client as RancherClient
from slack import WebClient as SlackClient
from slack.errors import SlackApiError
class Status(enum.Enum):
    """Outcome of a single health check as shown in the report."""

    # The checked target reported a healthy state.
    healthy = 1
    # The checked target reported a state that is not considered healthy.
    unhealthy = 2
    # The check itself failed to run.
    error = 3
    # No automated check exists for the target.
    unavailable = 4
# Load the configuration once at import time. The path comes from the
# CONFIG_FILE environment variable and falls back to ./config.json.
# Every JSON object is turned into a namedtuple so that settings are read
# with attribute access (e.g. CONF.slack.channel).
# The file is opened in a `with` block so the handle is closed right after
# reading instead of being left for the garbage collector.
with open(os.getenv('CONFIG_FILE', "./config.json")) as _config_file:
    CONF = json.loads(
        _config_file.read(),
        object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
    )
# Emoji used in the report for each Status member. The emoji is read from
# the CONF.slack.emoji attribute that carries the same name as the member.
EMOJI = {
    state: getattr(CONF.slack.emoji, state.name)
    for state in Status
}

# State strings that _get_status() treats as healthy; anything else is
# reported as unhealthy.
HEALTHY_STATES = ["active", "connected"]
# Report banner: a separator line, the configured title followed by today's
# date, a legend explaining each status emoji, and a closing separator line.
REPORT_HEADER = "".join([
    "*******************************************************************************\n",
    CONF.slack.report_header,
    date.today().strftime(" %d/%m/%Y\n"),
    EMOJI[Status.healthy] + " Healthy ",
    EMOJI[Status.unhealthy] + " Unhealthy ",
    EMOJI[Status.error] + " Checks failed ",
    EMOJI[Status.unavailable] + " Checks not automated\n",
    "*******************************************************************************\n",
])

# Prefixes put in front of each detail line under a check result:
# MSG_PREFIX_ERROR when the check failed, MSG_PREFIX_BLANK otherwise.
MSG_PREFIX_BLANK = " :warning-sign: "
MSG_PREFIX_ERROR = " [ERROR]: "
def _get_status(state_string):
    """Translate a raw state string into a Status member.

    Returns Status.healthy when the string is listed in HEALTHY_STATES and
    Status.unhealthy for every other value.
    """
    is_healthy = state_string in HEALTHY_STATES
    return Status.healthy if is_healthy else Status.unhealthy
def _get_report(name, status, msg=None):
    """Build the report lines for one check.

    Args:
        name: Display name of the checked item.
        status: Status member for the check. A falsy value (None, "")
            means there is nothing to report.
        msg: Optional detail messages. May be a list of strings or a
            single string; a single string is treated as one message.

    Returns:
        A list of lines: the emoji plus name first, followed by one
        prefixed line per detail message. Empty when status is falsy.
    """
    if not status:
        return []

    if msg is None:
        details = []
    elif isinstance(msg, str):
        # Iterating a bare string would emit one report line per
        # character, so wrap it as a single message instead.
        details = [msg]
    else:
        details = msg

    # Failed checks get the error prefix, everything else the warning one.
    msg_prefix = MSG_PREFIX_ERROR if status == Status.error else MSG_PREFIX_BLANK
    return [EMOJI[status] + " " + name] + [msg_prefix + m for m in details]
def _run_adb_command(adb_path, ssh_client, command):
    """Run `<adb_path> shell <command>` through ssh_client.exec_command().

    Args:
        adb_path: adb executable (and any leading arguments) as a string.
        ssh_client: Object exposing exec_command(cmd) that returns a
            (stdin, stdout, stderr) triple whose streams yield bytes.
        command: Shell command to hand to `adb shell`.

    Returns:
        (False, message) when the error stream contains "Error",
        otherwise (True, output) with surrounding newlines stripped.
    """
    final_command = adb_path + " shell " + command
    stdin, stdout, stderr = ssh_client.exec_command(final_command)

    # Decode leniently: a strict ASCII decode raises UnicodeDecodeError as
    # soon as the output holds a single non-ASCII byte. Pure ASCII output
    # decodes exactly as before.
    error = stderr.read().decode("utf-8", errors="replace").strip("\n")
    if "Error" in error:
        msg = "failed to run command: " + final_command
        return False, msg

    output = stdout.read().decode("utf-8", errors="replace").strip("\n")
    # NOTE(review): fixed pause after each successful command; presumably
    # pacing between consecutive adb invocations - confirm before changing.
    time.sleep(2)
    return True, output
def get_project_status(cluster, project):
    """Check every app or workload of a managed project in a cluster.

    Args:
        cluster: Cluster object; cluster.projects(name=...) is used to look
            up the matching projects.
        project: Config entry providing `name` and `check_type`. When
            check_type is "app" the project's apps are inspected, otherwise
            its workloads.

    Returns:
        (status, msg). status is Status.unhealthy when at least one target
        is unhealthy and Status.healthy when all inspected targets are
        healthy. msg holds one warning per unhealthy target. When nothing
        was inspected, status is the empty string.
    """
    check_type = project.check_type
    inspected = False
    msg = []

    for p in cluster.projects(name=project.name):
        targets = p.apps() if check_type == "app" else p.workloads()
        for target in targets.data:
            inspected = True
            if _get_status(target.state) == Status.unhealthy:
                msg.append(check_type + " " + target.name + " is unhealthy")

    if not inspected:
        # Keep the falsy status for "nothing to check" so the report
        # builder leaves this project out, as it did before.
        return "", msg

    # Aggregate over all targets rather than trusting the last one seen:
    # a healthy final target must not hide earlier unhealthy ones.
    status = Status.unhealthy if msg else Status.healthy
    return status, msg
def get_aether_network_status(edge_name):
    """Query the edge monitoring API for the network status of one edge.

    Args:
        edge_name: Name appended to CONF.edge_monitoring.api_url.

    Returns:
        (status, msg) where msg is always a list of strings:
        - Status.unavailable when the API answers 404 for this edge.
        - Status.error when the request fails or the last status update is
          more than 10 minutes old.
        - Status.unhealthy when the control plane and/or the user plane is
          not healthy, with one message per affected plane.
        - Status.healthy otherwise.
    """
    msg = []
    try:
        req_url = CONF.edge_monitoring.api_url + "/" + edge_name
        # Bound the wait so an unresponsive API cannot hang the report;
        # a timeout lands in the handler below like any other failure.
        response = requests.get(req_url, timeout=30)
        if response.status_code == 404:
            return Status.unavailable, []
        response.raise_for_status()
    except Exception as e:
        # Wrap in a list: the report builder iterates over the messages,
        # and a bare string would be split into single characters.
        return Status.error, [str(e)]

    json_resp = json.loads(response.text)['edge']

    # A stale report means the edge stopped reporting; treat as a failure.
    last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
    time_diff = datetime.datetime.now() - last_update
    time_diff_mins = int(round(time_diff.total_seconds() / 60))
    if time_diff_mins > 10:
        msg += ['status report not received for ' + str(time_diff_mins) + 'min']
        return Status.error, msg

    status = Status.healthy

    cp_status = _get_status(json_resp['status']['control_plane'])
    if cp_status is not Status.healthy:
        status = Status.unhealthy
        msg += ["control plane is not healthy"]

    up_status = _get_status(json_resp['status']['user_plane'])
    if up_status is not Status.healthy:
        status = Status.unhealthy
        msg += ["user plane is not healthy"]

    return status, msg
def get_k8s_status(cluster):
    """Evaluate the Kubernetes health of a cluster.

    Returns:
        (Status.error, [transitioning message]) when the cluster state is
        "unavailable" or maps to unhealthy. Otherwise the cluster status
        together with one warning per component condition whose status is
        not "True" and one per node whose state is not healthy.
    """
    cluster_status = _get_status(cluster.state)
    if cluster.state == "unavailable" or cluster_status is Status.unhealthy:
        return Status.error, [cluster.transitioningMessage]

    # One warning for every failing condition of every component.
    warnings = [
        component.name + " is unhealthy"
        for component in cluster.componentStatuses
        for condition in component.conditions
        if condition.status != "True"
    ]
    # Followed by one warning for every node that is not healthy.
    warnings.extend(
        node.hostname + " is unhealthy"
        for node in cluster.nodes()
        if _get_status(node.state) != Status.healthy
    )
    return cluster_status, warnings
def get_cluster_health_report(cluster, edge=False):
    """Assemble the health report text for a single cluster.

    Args:
        cluster: Cluster object to inspect.
        edge: When true, the Aether network check is appended as well.

    Returns:
        The report as one string with newline-separated lines, starting
        with the cluster name.
    """
    lines = ["*[" + cluster.name + "]*"]

    # Kubernetes API health comes first; the project checks depend on it.
    k8s_status, k8s_msg = get_k8s_status(cluster)
    lines.extend(_get_report("Kubernetes", k8s_status, k8s_msg))

    # Managed projects are only inspected on a healthy cluster; otherwise
    # each of them is reported as a failed check without details.
    k8s_healthy = k8s_status == Status.healthy
    for project in CONF.managed_projects:
        if k8s_healthy:
            project_status, project_msg = get_project_status(cluster, project)
        else:
            project_status, project_msg = Status.error, []
        lines.extend(_get_report(project.display, project_status, project_msg))

    # Aether network health, for edges only.
    # TODO: separate report for control plane and user plane
    if edge:
        net_status, net_msg = get_aether_network_status(cluster.name)
        lines.extend(_get_report("Aether Network", net_status, net_msg))

    return "\n".join(lines)
def main():
    """Build the health report for production clusters and post it to Slack.

    Clusters are listed through the Rancher API. Every cluster whose name
    contains "production" is reported; those whose name also contains
    "edge" additionally get the Aether network check. If listing the
    clusters fails, the error is put into the report instead. A Slack API
    error while posting is printed to stdout.
    """
    report = REPORT_HEADER

    # Get cluster status from Rancher
    try:
        rancher_client = RancherClient(
            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
            secret_key=CONF.rancher.secret_key)
        response = rancher_client.list_cluster()
    except Exception as e:
        # _get_report() returns a list of lines while the report is a
        # string, so join the lines before appending them.
        report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
        report += "\n\n"
        # Empty stand-in so the loop below simply has nothing to do.
        response = namedtuple('X', "data")([])

    # Check cluster health and make a report
    for cluster in response.data:
        if "production" in cluster.name:
            edge = "edge" in cluster.name
            report += get_cluster_health_report(cluster, edge)
            report += "\n\n"

    # Publish the report to Slack channel
    try:
        slack_client = SlackClient(token=CONF.slack.api_token)
        slack_client.chat_postMessage(
            channel=CONF.slack.channel,
            text=report)
    except SlackApiError as e:
        # Report the failure without assert statements: they are stripped
        # under -O and would hide the real error if one ever failed.
        print(f"Got an error: {e.response['error']}")


if __name__ == "__main__":
    main()