slack-notifier/aether_status_notifier.py - aether-monitoring - Gitiles

 #!/usr/bin/env python

 # Copyright 2020-present Open Networking Foundation
 #
 # SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0

 import sys
 import os
 import json
 import enum
 import time
 import datetime
 import requests
 from pytz import timezone
 from datetime import date
 from collections import namedtuple
 from rancher import Client as RancherClient
 from slack import WebClient as SlackClient
 from slack.errors import SlackApiError


 class Status(enum.Enum):
     """Possible outcomes of a single health check."""

     healthy = 1
     unhealthy = 2
     # The check itself failed.
     error = 3
     # The check is not automated.
     unavailable = 4


 # Load the JSON configuration named by $CONFIG_FILE (default ./config.json).
 # Every JSON object is turned into a namedtuple, so settings are read with
 # attribute access (e.g. CONF.slack.channel); JSON arrays stay lists.
 # The file is read inside a `with` block so the handle is closed as soon as
 # parsing is done instead of being left open.
 with open(os.getenv('CONFIG_FILE', "./config.json")) as _config_file:
     CONF = json.load(
         _config_file,
         object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
     )

 # Slack emoji for each check result: the entry of the configuration's
 # slack.emoji section that carries the same name as the Status member.
 EMOJI = {
     result: getattr(CONF.slack.emoji, result.name)
     for result in Status
 }

 # State strings that _get_status() maps to Status.healthy.
 HEALTHY_STATES = ["active", "connected"]

 # Banner placed at the top of the report: a rule of asterisks, the
 # configured title followed by today's date (dd/mm/YYYY), a legend that
 # pairs each status emoji with its meaning, and a closing rule.
 # It is assembled once when the module is loaded, so the date is the one
 # at load time.
 REPORT_HEADER = '''\
 *******************************************************************************
 '''
 REPORT_HEADER += CONF.slack.report_header
 REPORT_HEADER += date.today().strftime(" %d/%m/%Y\n")
 REPORT_HEADER += EMOJI[Status.healthy] + " Healthy   "
 REPORT_HEADER += EMOJI[Status.unhealthy] + " Unhealthy   "
 REPORT_HEADER += EMOJI[Status.error] + " Checks failed   "
 REPORT_HEADER += EMOJI[Status.unavailable] + " Checks not automated\n"
 REPORT_HEADER += '''\
 *******************************************************************************
 '''

 # Prefixes for the detail lines printed under a check result.
 # _get_report() uses the [ERROR] form when the status is Status.error and
 # the warning-sign form for every other status.
 MSG_PREFIX_BLANK = "      :warning-sign: "
 MSG_PREFIX_ERROR = "       [ERROR]: "


 def _get_status(state_string):
     """Translate a raw state string into a Status member.

     Returns Status.healthy when state_string is listed in HEALTHY_STATES
     and Status.unhealthy for anything else.
     """
     is_healthy = state_string in HEALTHY_STATES
     return Status.healthy if is_healthy else Status.unhealthy


 def _get_report(name, status, msg=None):
     """Format one check result as a list of report lines.

     Args:
         name: Display name of the checked item.
         status: Status member of the check. Any falsy value (e.g. "")
             means there is nothing to report.
         msg: Optional iterable of detail messages. A single string is
             accepted and treated as one message.

     Returns:
         [] when status is falsy; otherwise a list whose first entry is
         "<emoji> <name>", followed by one prefixed line per message.
     """
     if not status:
         return []

     if msg is None:
         details = []
     elif isinstance(msg, str):
         # Iterating a bare string would emit one report line per
         # character, so wrap it into a single-message list.
         details = [msg]
     else:
         details = list(msg)

     if status == Status.error:
         msg_prefix = MSG_PREFIX_ERROR
     else:
         msg_prefix = MSG_PREFIX_BLANK
     return [EMOJI[status] + " " + name] + [msg_prefix + m for m in details]


 def _run_adb_command(adb_path, ssh_client, command):
     """Run an adb shell command through ssh_client.exec_command().

     The command line executed is "<adb_path> shell <command>".

     Returns:
         (False, "failed to run command: <command line>") when the decoded
         stderr contains "Error"; otherwise (True, output), where output is
         stdout decoded as ASCII with leading/trailing newlines removed.
     """
     cmd_line = " ".join([adb_path, "shell", command])
     _, out_stream, err_stream = ssh_client.exec_command(cmd_line)

     err_text = err_stream.read().decode("ascii").strip("\n")
     if "Error" not in err_text:
         out_text = out_stream.read().decode("ascii").strip("\n")
         # Wait two seconds before handing the output back.
         time.sleep(2)
         return True, out_text

     return False, "failed to run command: " + cmd_line


 def get_project_status(cluster, project):
     """Check the apps or workloads of a managed project on a cluster.

     Args:
         cluster: Cluster object; cluster.projects(name=...) is called to
             look up the matching project(s).
         project: Configuration entry providing `name` and `check_type`.
             A check_type of "app" inspects p.apps(); any other value
             inspects p.workloads().

     Returns:
         (status, msg). status is "" when no target was found,
         Status.unhealthy when at least one target is unhealthy and
         Status.healthy otherwise. msg holds one warning per unhealthy
         target.
     """
     check_type = project.check_type
     found_target = False
     unhealthy = []
     for p in cluster.projects(name=project.name):
         targets = p.apps() if check_type == "app" else p.workloads()
         for target in targets.data:
             found_target = True
             if _get_status(target.state) == Status.unhealthy:
                 unhealthy.append(
                     check_type + " " + target.name + " is unhealthy")

     if not found_target:
         # Nothing was inspected: keep the empty status, for which
         # _get_report() produces no report lines.
         return "", []

     # Any unhealthy target makes the project unhealthy. Deciding on the
     # last inspected target alone would hide earlier failures.
     if unhealthy:
         return Status.unhealthy, unhealthy
     return Status.healthy, []


 def get_aether_network_status(edge_name):
     """Query the edge monitoring API for the network status of an edge.

     Args:
         edge_name: Name appended to CONF.edge_monitoring.api_url to form
             the request URL.

     Returns:
         (status, msg), where msg is always a list of strings:
         - (Status.unavailable, []) when the API answers 404;
         - (Status.error, [reason]) when the request fails or the last
           status update is more than 10 minutes old;
         - (Status.unhealthy, [...]) when the control plane and/or the
           user plane is not healthy;
         - (Status.healthy, []) otherwise.
     """
     try:
         req_url = CONF.edge_monitoring.api_url + "/" + edge_name
         # Without a timeout an unresponsive API would block the whole
         # report; a timeout surfaces as an exception handled below.
         response = requests.get(req_url, timeout=30)

         if response.status_code == 404:
             return Status.unavailable, []
         response.raise_for_status()
     except Exception as e:
         # The message is wrapped in a list because callers iterate msg
         # item by item; a bare string would be split into characters.
         return Status.error, [str(e)]

     json_resp = json.loads(response.text)['edge']

     # Both timestamps are naive local times, so they can be subtracted.
     last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
     time_diff = datetime.datetime.now() - last_update
     time_diff_mins = int(round(time_diff.total_seconds() / 60))
     if time_diff_mins > 10:
         return Status.error, [
             'status report not received for ' + str(time_diff_mins) + 'min']

     msg = []
     if _get_status(json_resp['status']['control_plane']) is not Status.healthy:
         msg.append("control plane is not healthy")
     if _get_status(json_resp['status']['user_plane']) is not Status.healthy:
         msg.append("user plane is not healthy")

     status = Status.unhealthy if msg else Status.healthy
     return status, msg


 def get_k8s_status(cluster):
     """Evaluate the state of a cluster, its components and its nodes.

     Returns:
         (Status.error, [cluster.transitioningMessage]) when the cluster
         state is "unavailable" or maps to Status.unhealthy. Otherwise
         (status, msg), where status comes from the cluster state and msg
         lists every component condition whose status is not "True" and
         every node whose state is not healthy.
     """
     status = _get_status(cluster.state)
     if status is Status.unhealthy or cluster.state == "unavailable":
         return Status.error, [cluster.transitioningMessage]

     component_warnings = [
         component.name + " is unhealthy"
         for component in cluster.componentStatuses
         for condition in component.conditions
         if condition.status != "True"
     ]
     node_warnings = [
         node.hostname + " is unhealthy"
         for node in cluster.nodes()
         if _get_status(node.state) != Status.healthy
     ]
     return status, component_warnings + node_warnings


 def get_cluster_health_report(cluster, edge=False):
     """Build the report section for one cluster.

     The section starts with the cluster name, followed by the Kubernetes
     check, one entry per project in CONF.managed_projects and, when edge
     is true, the Aether network check.

     Returns:
         The section as a single newline-joined string.
     """
     lines = ["*[" + cluster.name + "]*"]

     # Kubernetes API health
     k8s_status, k8s_msg = get_k8s_status(cluster)
     lines.extend(_get_report("Kubernetes", k8s_status, k8s_msg))

     # Managed projects are only inspected on a healthy cluster; otherwise
     # each of them is reported as a failed check.
     k8s_healthy = k8s_status == Status.healthy
     for project in CONF.managed_projects:
         if k8s_healthy:
             proj_status, proj_msg = get_project_status(cluster, project)
         else:
             proj_status, proj_msg = Status.error, []
         lines.extend(_get_report(project.display, proj_status, proj_msg))

     # Aether network health, for edges only
     # TODO: separate report for control plane and user plane
     if edge:
         net_status, net_msg = get_aether_network_status(cluster.name)
         lines.extend(_get_report("Aether Network", net_status, net_msg))

     return "\n".join(lines)


 def main():
     """Build the cluster health report and post it to Slack.

     Clusters are listed through the Rancher API. Every cluster whose name
     contains "production" gets a report section; clusters whose name also
     contains "edge" additionally get the Aether network check. If the
     Rancher call fails, the failure is written into the report instead.
     A Slack API error is printed to stdout.
     """
     report = REPORT_HEADER

     # Get cluster status from Rancher
     try:
         rancher_client = RancherClient(
             url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
             secret_key=CONF.rancher.secret_key)
         clusters = rancher_client.list_cluster().data
     except Exception as e:
         # _get_report() returns a list of lines. It has to be joined
         # before being appended, because adding a list to the report
         # string raises TypeError.
         report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
         report += "\n\n"
         clusters = []

     # Check cluster health and make a report
     for cluster in clusters:
         if "production" in cluster.name:
             edge = "edge" in cluster.name
             report += get_cluster_health_report(cluster, edge)
             report += "\n\n"

     # Publish the report to Slack channel
     try:
         slack_client = SlackClient(token=CONF.slack.api_token)
         slack_client.chat_postMessage(
             channel=CONF.slack.channel,
             text=report)
     except SlackApiError as e:
         # No `assert` here: assertions are stripped under -O and a failing
         # one would replace the Slack error with an AssertionError.
         print(f"Got an error: {e.response['error']}")


 if __name__ == "__main__":
     main()
	#!/usr/bin/env python

	# Copyright 2020-present Open Networking Foundation
	#
	# SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0

	import sys
	import os
	import json
	import enum
	import time
	import datetime
	import requests
	from pytz import timezone
	from datetime import date
	from collections import namedtuple
	from rancher import Client as RancherClient
	from slack import WebClient as SlackClient
	from slack.errors import SlackApiError


	class Status(enum.Enum):
	    """Possible outcomes of a single health check."""

	    healthy = 1
	    unhealthy = 2
	    # The check itself failed.
	    error = 3
	    # The check is not automated.
	    unavailable = 4


	# Load the JSON configuration named by $CONFIG_FILE (default ./config.json).
	# Every JSON object is turned into a namedtuple, so settings are read with
	# attribute access (e.g. CONF.slack.channel); JSON arrays stay lists.
	# The file is read inside a `with` block so the handle is closed as soon as
	# parsing is done instead of being left open.
	with open(os.getenv('CONFIG_FILE', "./config.json")) as _config_file:
	    CONF = json.load(
	        _config_file,
	        object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
	    )

	# Slack emoji for each check result: the entry of the configuration's
	# slack.emoji section that carries the same name as the Status member.
	EMOJI = {
	    result: getattr(CONF.slack.emoji, result.name)
	    for result in Status
	}

	# State strings that _get_status() maps to Status.healthy.
	HEALTHY_STATES = ["active", "connected"]

	# Banner placed at the top of the report: a rule of asterisks, the
	# configured title followed by today's date (dd/mm/YYYY), a legend that
	# pairs each status emoji with its meaning, and a closing rule.
	# It is assembled once when the module is loaded, so the date is the one
	# at load time.
	REPORT_HEADER = '''\
	*******************************************************************************
	'''
	REPORT_HEADER += CONF.slack.report_header
	REPORT_HEADER += date.today().strftime(" %d/%m/%Y\n")
	REPORT_HEADER += EMOJI[Status.healthy] + " Healthy "
	REPORT_HEADER += EMOJI[Status.unhealthy] + " Unhealthy "
	REPORT_HEADER += EMOJI[Status.error] + " Checks failed "
	REPORT_HEADER += EMOJI[Status.unavailable] + " Checks not automated\n"
	REPORT_HEADER += '''\
	*******************************************************************************
	'''

	# Prefixes for the detail lines printed under a check result.
	# _get_report() uses the [ERROR] form when the status is Status.error and
	# the warning-sign form for every other status.
	MSG_PREFIX_BLANK = " :warning-sign: "
	MSG_PREFIX_ERROR = " [ERROR]: "


	def _get_status(state_string):
	    """Translate a raw state string into a Status member.

	    Returns Status.healthy when state_string is listed in HEALTHY_STATES
	    and Status.unhealthy for anything else.
	    """
	    if state_string in HEALTHY_STATES:
	        return Status.healthy
	    return Status.unhealthy


	def _get_report(name, status, msg=None):
	    """Format one check result as a list of report lines.

	    Args:
	        name: Display name of the checked item.
	        status: Status member of the check. Any falsy value (e.g. "")
	            means there is nothing to report.
	        msg: Optional iterable of detail messages. A single string is
	            accepted and treated as one message.

	    Returns:
	        [] when status is falsy; otherwise a list whose first entry is
	        "<emoji> <name>", followed by one prefixed line per message.
	    """
	    if not status:
	        return []

	    if msg is None:
	        details = []
	    elif isinstance(msg, str):
	        # Iterating a bare string would emit one report line per
	        # character, so wrap it into a single-message list.
	        details = [msg]
	    else:
	        details = list(msg)

	    if status == Status.error:
	        msg_prefix = MSG_PREFIX_ERROR
	    else:
	        msg_prefix = MSG_PREFIX_BLANK
	    return [EMOJI[status] + " " + name] + [msg_prefix + m for m in details]


	def _run_adb_command(adb_path, ssh_client, command):
	    """Run an adb shell command through ssh_client.exec_command().

	    The command line executed is "<adb_path> shell <command>".

	    Returns:
	        (False, "failed to run command: <command line>") when the decoded
	        stderr contains "Error"; otherwise (True, output), where output is
	        stdout decoded as ASCII with leading/trailing newlines removed.
	    """
	    final_command = adb_path + " shell " + command
	    _, stdout, stderr = ssh_client.exec_command(final_command)
	    error = stderr.read().decode("ascii").strip("\n")
	    if "Error" in error:
	        return False, "failed to run command: " + final_command
	    output = stdout.read().decode("ascii").strip("\n")
	    # Wait two seconds before handing the output back.
	    time.sleep(2)
	    return True, output


	def get_project_status(cluster, project):
	    """Check the apps or workloads of a managed project on a cluster.

	    Args:
	        cluster: Cluster object; cluster.projects(name=...) is called to
	            look up the matching project(s).
	        project: Configuration entry providing `name` and `check_type`.
	            A check_type of "app" inspects p.apps(); any other value
	            inspects p.workloads().

	    Returns:
	        (status, msg). status is "" when no target was found,
	        Status.unhealthy when at least one target is unhealthy and
	        Status.healthy otherwise. msg holds one warning per unhealthy
	        target.
	    """
	    check_type = project.check_type
	    found_target = False
	    unhealthy = []
	    for p in cluster.projects(name=project.name):
	        targets = p.apps() if check_type == "app" else p.workloads()
	        for target in targets.data:
	            found_target = True
	            if _get_status(target.state) == Status.unhealthy:
	                unhealthy.append(
	                    check_type + " " + target.name + " is unhealthy")

	    if not found_target:
	        # Nothing was inspected: keep the empty status, for which
	        # _get_report() produces no report lines.
	        return "", []

	    # Any unhealthy target makes the project unhealthy. Deciding on the
	    # last inspected target alone would hide earlier failures.
	    if unhealthy:
	        return Status.unhealthy, unhealthy
	    return Status.healthy, []


	def get_aether_network_status(edge_name):
	    """Query the edge monitoring API for the network status of an edge.

	    Args:
	        edge_name: Name appended to CONF.edge_monitoring.api_url to form
	            the request URL.

	    Returns:
	        (status, msg), where msg is always a list of strings:
	        - (Status.unavailable, []) when the API answers 404;
	        - (Status.error, [reason]) when the request fails or the last
	          status update is more than 10 minutes old;
	        - (Status.unhealthy, [...]) when the control plane and/or the
	          user plane is not healthy;
	        - (Status.healthy, []) otherwise.
	    """
	    try:
	        req_url = CONF.edge_monitoring.api_url + "/" + edge_name
	        # Without a timeout an unresponsive API would block the whole
	        # report; a timeout surfaces as an exception handled below.
	        response = requests.get(req_url, timeout=30)

	        if response.status_code == 404:
	            return Status.unavailable, []
	        response.raise_for_status()
	    except Exception as e:
	        # The message is wrapped in a list because callers iterate msg
	        # item by item; a bare string would be split into characters.
	        return Status.error, [str(e)]

	    json_resp = json.loads(response.text)['edge']

	    # Both timestamps are naive local times, so they can be subtracted.
	    last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
	    time_diff = datetime.datetime.now() - last_update
	    time_diff_mins = int(round(time_diff.total_seconds() / 60))
	    if time_diff_mins > 10:
	        return Status.error, [
	            'status report not received for ' + str(time_diff_mins) + 'min']

	    msg = []
	    if _get_status(json_resp['status']['control_plane']) is not Status.healthy:
	        msg.append("control plane is not healthy")
	    if _get_status(json_resp['status']['user_plane']) is not Status.healthy:
	        msg.append("user plane is not healthy")

	    status = Status.unhealthy if msg else Status.healthy
	    return status, msg


	def get_k8s_status(cluster):
	    """Evaluate the state of a cluster, its components and its nodes.

	    Returns:
	        (Status.error, [cluster.transitioningMessage]) when the cluster
	        state is "unavailable" or maps to Status.unhealthy. Otherwise
	        (status, msg), where status comes from the cluster state and msg
	        lists every component condition whose status is not "True" and
	        every node whose state is not healthy.
	    """
	    msg = []

	    status = _get_status(cluster.state)
	    if cluster.state == "unavailable" or status is Status.unhealthy:
	        msg += [cluster.transitioningMessage]
	        return Status.error, msg

	    for component in cluster.componentStatuses:
	        for condition in component.conditions:
	            if condition.status != "True":
	                msg += [component.name + " is unhealthy"]
	    for node in cluster.nodes():
	        if _get_status(node.state) != Status.healthy:
	            msg += [node.hostname + " is unhealthy"]

	    return status, msg


	def get_cluster_health_report(cluster, edge=False):
	    """Build the report section for one cluster.

	    The section starts with the cluster name, followed by the Kubernetes
	    check, one entry per project in CONF.managed_projects and, when edge
	    is true, the Aether network check.

	    Returns:
	        The section as a single newline-joined string.
	    """
	    report = ["[" + cluster.name + "]"]

	    # Check K8S API health
	    k8s_status, msg = get_k8s_status(cluster)
	    report += _get_report("Kubernetes", k8s_status, msg)

	    # Check managed project health; projects are only inspected on a
	    # healthy cluster, otherwise each is reported as a failed check.
	    for project in CONF.managed_projects:
	        msg = []
	        if k8s_status == Status.healthy:
	            status, msg = get_project_status(cluster, project)
	        else:
	            status = Status.error
	        report += _get_report(project.display, status, msg)

	    # Check Aether network health for Edges
	    # TODO: separate report for control plane and user plane
	    if edge:
	        status, msg = get_aether_network_status(cluster.name)
	        report += _get_report("Aether Network", status, msg)

	    return "\n".join(report)


	def main():
	    """Build the cluster health report and post it to Slack.

	    Clusters are listed through the Rancher API. Every cluster whose name
	    contains "production" gets a report section; clusters whose name also
	    contains "edge" additionally get the Aether network check. If the
	    Rancher call fails, the failure is written into the report instead.
	    A Slack API error is printed to stdout.
	    """
	    report = REPORT_HEADER

	    # Get cluster status from Rancher
	    try:
	        rancher_client = RancherClient(
	            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
	            secret_key=CONF.rancher.secret_key)
	        clusters = rancher_client.list_cluster().data
	    except Exception as e:
	        # _get_report() returns a list of lines. It has to be joined
	        # before being appended, because adding a list to the report
	        # string raises TypeError.
	        report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
	        report += "\n\n"
	        clusters = []

	    # Check cluster health and make a report
	    for cluster in clusters:
	        if "production" in cluster.name:
	            edge = "edge" in cluster.name
	            report += get_cluster_health_report(cluster, edge)
	            report += "\n\n"

	    # Publish the report to Slack channel
	    try:
	        slack_client = SlackClient(token=CONF.slack.api_token)
	        slack_client.chat_postMessage(
	            channel=CONF.slack.channel,
	            text=report)
	    except SlackApiError as e:
	        # No `assert` here: assertions are stripped under -O and a failing
	        # one would replace the Slack error with an AssertionError.
	        print(f"Got an error: {e.response['error']}")


	if __name__ == "__main__":
	    main()