AETHER-8 Add a script for daily status report to Slack channel
Change-Id: Ib5e7102d968961e1ca071eaf8a006cc67a4d5c9b
diff --git a/slack-notifier/aether_status_notifier.py b/slack-notifier/aether_status_notifier.py
new file mode 100644
index 0000000..4cd3aa2
--- /dev/null
+++ b/slack-notifier/aether_status_notifier.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import json
+import enum
+import time
+import datetime
+import requests
+from pytz import timezone
+from datetime import date
+from collections import namedtuple
+from rancher import Client as RancherClient
+from slack import WebClient as SlackClient
+from slack.errors import SlackApiError
+
+
+class Status(enum.Enum):
+ healthy = 1
+ unhealthy = 2
+ error = 3 # check failed
+ unavailable = 4 # check not automated
+
+
+CONF = json.loads(
+ open(os.getenv('CONFIG_FILE', "./config.json")).read(),
+ object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
+)
+
+EMOJI = {
+ Status.healthy: CONF.slack.emoji.healthy,
+ Status.unhealthy: CONF.slack.emoji.unhealthy,
+ Status.error: CONF.slack.emoji.error,
+ Status.unavailable: CONF.slack.emoji.unavailable
+}
+
+HEALTHY_STATES = ["active", "connected"]
+
+REPORT_HEADER = '''\
+*******************************************************************************
+'''
+REPORT_HEADER += CONF.slack.report_header
+REPORT_HEADER += date.today().strftime(" %d/%m/%Y\n")
+REPORT_HEADER += EMOJI[Status.healthy] + " Healthy "
+REPORT_HEADER += EMOJI[Status.unhealthy] + " Unhealthy "
+REPORT_HEADER += EMOJI[Status.error] + " Checks failed "
+REPORT_HEADER += EMOJI[Status.unavailable] + " Checks not automated\n"
+REPORT_HEADER += '''\
+*******************************************************************************
+'''
+
+# Prefix for detail lines listed under a report entry that is not an error.
+MSG_PREFIX_BLANK = " :warning-sign: "
+# Prefix for detail lines listed under an entry whose status is Status.error.
+MSG_PREFIX_ERROR = " [ERROR]: "
+
+
+def _get_status(state_string):
+ if state_string in HEALTHY_STATES:
+ return Status.healthy
+ else:
+ return Status.unhealthy
+
+
+def _get_report(name, status, msg=[]):
+ report = []
+ if not status:
+ return []
+ report += [EMOJI[status] + " " + name]
+ msg_prefix = MSG_PREFIX_ERROR if status == Status.error else MSG_PREFIX_BLANK
+ for m in msg:
+ report += [msg_prefix + m]
+ return report
+
+
+def _run_adb_command(adb_path, ssh_client, command):
+ final_command = adb_path + " shell " + command
+ stdin, stdout, stderr = ssh_client.exec_command(final_command)
+ error = stderr.read().decode("ascii").strip("\n")
+ if "Error" in error:
+ msg = "failed to run command: " + final_command
+ return False, msg
+ output = stdout.read().decode("ascii").strip("\n")
+ time.sleep(2)
+ return True, output
+
+
+def get_project_status(cluster, project):
+ status = ""
+ msg = []
+
+ projects = cluster.projects(name=project.name)
+ check_type = project.check_type
+ unhealthy = []
+ for p in projects:
+ targets = p.apps() if check_type == "app" else p.workloads()
+ for target in targets.data:
+ status = _get_status(target.state)
+ if status == Status.unhealthy:
+ warning = check_type + " " + target.name + " is unhealthy"
+ unhealthy += [warning]
+ if status == Status.unhealthy:
+ msg += unhealthy
+
+ return status, msg
+
+
+def get_aether_network_status(edge_name):
+ status = ""
+ msg = []
+
+ try:
+ req_url = CONF.edge_monitoring.api_url + "/" + edge_name
+ response = requests.get(req_url)
+
+ if response.status_code == 404:
+ return Status.unavailable, []
+ response.raise_for_status()
+ except Exception as e:
+ return Status.error, str(e)
+
+ json_resp = json.loads(response.text)['edge']
+
+ last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
+ time_diff = datetime.datetime.now() - last_update
+ time_diff_mins = int(round(time_diff.total_seconds() / 60))
+ if time_diff_mins > 10:
+ msg += ['status report not received for ' + str(time_diff_mins) + 'min']
+ return Status.error, msg
+
+ status = Status.healthy
+
+ cp_status = _get_status(json_resp['status']['control_plane'])
+ if cp_status is not Status.healthy:
+ status = Status.unhealthy
+ msg += ["control plane is not healthy"]
+
+ up_status = _get_status(json_resp['status']['user_plane'])
+ if up_status is not Status.healthy:
+ status = Status.unhealthy
+ msg += ["user plane is not healthy"]
+
+ return status, msg
+
+
+def get_k8s_status(cluster):
+ status = ""
+ msg = []
+
+ status = _get_status(cluster.state)
+ if cluster.state == "unavailable" or status is Status.unhealthy:
+ msg += [cluster.transitioningMessage]
+ return Status.error, msg
+
+ for component in cluster.componentStatuses:
+ for condition in component.conditions:
+ if condition.status != "True":
+ msg += [component.name + " is unhealthy"]
+ for node in cluster.nodes():
+ if _get_status(node.state) != Status.healthy:
+ msg += [node.hostname + " is unhealthy"]
+
+ return status, msg
+
+
+def get_cluster_health_report(cluster, edge=False):
+ status = ""
+ msg = []
+ report = ["*[" + cluster.name + "]*"]
+
+ # Check K8S API health
+ k8s_status, msg = get_k8s_status(cluster)
+ report += _get_report("Kubernetes", k8s_status, msg)
+
+ # Check managed project health
+ for project in CONF.managed_projects:
+ status = ""
+ msg = []
+ if k8s_status == Status.healthy:
+ status, msg = get_project_status(cluster, project)
+ else:
+ status = Status.error
+ report += _get_report(project.display, status, msg)
+
+ # Check Aether network health for Edges
+ # TODO: separate report for control plane and user plane
+ if edge:
+ status, msg = get_aether_network_status(cluster.name)
+ report += _get_report("Aether Network", status, msg)
+
+ report_string = "\n".join(report)
+ return report_string
+
+
+def main():
+ report = REPORT_HEADER
+
+ # Get cluster status from Rancher
+ try:
+ rancher_client = RancherClient(
+ url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
+ secret_key=CONF.rancher.secret_key)
+ response = rancher_client.list_cluster()
+ except Exception as e:
+ report += _get_report("Rancher", Status.error, [str(e)])
+ response = namedtuple('X', "data")([])
+
+ # Check cluster health and make a report
+ for cluster in response.data:
+ if "production" in cluster.name:
+ edge = True if "edge" in cluster.name else False
+ report += get_cluster_health_report(cluster, edge)
+ report += "\n\n"
+
+ # Publish the report to Slack channel
+ try:
+ slack_client = SlackClient(token=CONF.slack.api_token)
+ response = slack_client.chat_postMessage(
+ channel=CONF.slack.channel,
+ text=report)
+ except SlackApiError as e:
+ assert e.response["ok"] is False
+ assert e.response["error"]
+ print(f"Got an error: {e.response['error']}")
+
+
+if __name__ == "__main__":
+ main()