AETHER-8 Add a script for daily status report to Slack channel Change-Id: Ib5e7102d968961e1ca071eaf8a006cc67a4d5c9b

commit: f32ae9a38efd4e0241fcfafdc356e42b041b4dfb [log] [tgz]
author: Hyunsun Moon <hyunsun@opennetworking.org> Thu May 28 13:17:45 2020 -0700
committer: Hyunsun Moon <hyunsun.moon@gmail.com> Sun Jul 26 16:27:13 2020 -0700
tree: 83bc8642b70f032b683a30935373aa84e1684b66
parent: 8f4b18a98f960bac26be0427cc155fd8c34611bc [diff] [blame]
diff --git a/slack-notifier/aether_status_notifier.py b/slack-notifier/aether_status_notifier.py
new file mode 100644
index 0000000..4cd3aa2
--- /dev/null
+++ b/slack-notifier/aether_status_notifier.py

@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import json
+import enum
+import time
+import datetime
+import requests
+from pytz import timezone
+from datetime import date
+from collections import namedtuple
+from rancher import Client as RancherClient
+from slack import WebClient as SlackClient
+from slack.errors import SlackApiError
+
+
+class Status(enum.Enum):
+    """Result of a single health check, used to pick the report emoji."""
+
+    healthy = 1
+    unhealthy = 2
+    error = 3  # check failed
+    unavailable = 4  # check not automated
+
+
+CONF = json.loads(
+    open(os.getenv('CONFIG_FILE', "./config.json")).read(),
+    object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
+)
+
+EMOJI = {
+    Status.healthy: CONF.slack.emoji.healthy,
+    Status.unhealthy: CONF.slack.emoji.unhealthy,
+    Status.error: CONF.slack.emoji.error,
+    Status.unavailable: CONF.slack.emoji.unavailable
+}
+
+HEALTHY_STATES = ["active", "connected"]
+
+REPORT_HEADER = '''\
+*******************************************************************************
+'''
+REPORT_HEADER += CONF.slack.report_header
+REPORT_HEADER += date.today().strftime(" %d/%m/%Y\n")
+REPORT_HEADER += EMOJI[Status.healthy] + " Healthy   "
+REPORT_HEADER += EMOJI[Status.unhealthy] + " Unhealthy   "
+REPORT_HEADER += EMOJI[Status.error] + " Checks failed   "
+REPORT_HEADER += EMOJI[Status.unavailable] + " Checks not automated\n"
+REPORT_HEADER += '''\
+*******************************************************************************
+'''
+
+MSG_PREFIX_BLANK = "      :warning-sign: "
+MSG_PREFIX_ERROR = "       [ERROR]: "
+
+
+def _get_status(state_string):
+    if state_string in HEALTHY_STATES:
+        return Status.healthy
+    else:
+        return Status.unhealthy
+
+
+def _get_report(name, status, msg=[]):
+    report = []
+    if not status:
+        return []
+    report += [EMOJI[status] + " " + name]
+    msg_prefix = MSG_PREFIX_ERROR if status == Status.error else MSG_PREFIX_BLANK
+    for m in msg:
+        report += [msg_prefix + m]
+    return report
+
+
+def _run_adb_command(adb_path, ssh_client, command):
+    final_command = adb_path + " shell " + command
+    stdin, stdout, stderr = ssh_client.exec_command(final_command)
+    error = stderr.read().decode("ascii").strip("\n")
+    if "Error" in error:
+        msg = "failed to run command: " + final_command
+        return False, msg
+    output = stdout.read().decode("ascii").strip("\n")
+    time.sleep(2)
+    return True, output
+
+
+def get_project_status(cluster, project):
+    status = ""
+    msg = []
+
+    projects = cluster.projects(name=project.name)
+    check_type = project.check_type
+    unhealthy = []
+    for p in projects:
+        targets = p.apps() if check_type == "app" else p.workloads()
+        for target in targets.data:
+            status = _get_status(target.state)
+            if status == Status.unhealthy:
+                warning = check_type + " " + target.name + " is unhealthy"
+                unhealthy += [warning]
+    if status == Status.unhealthy:
+        msg += unhealthy
+
+    return status, msg
+
+
+def get_aether_network_status(edge_name):
+    status = ""
+    msg = []
+
+    try:
+        req_url = CONF.edge_monitoring.api_url + "/" + edge_name
+        response = requests.get(req_url)
+
+        if response.status_code == 404:
+            return Status.unavailable, []
+        response.raise_for_status()
+    except Exception as e:
+        return Status.error, str(e)
+
+    json_resp = json.loads(response.text)['edge']
+
+    last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
+    time_diff = datetime.datetime.now() - last_update
+    time_diff_mins = int(round(time_diff.total_seconds() / 60))
+    if time_diff_mins > 10:
+        msg += ['status report not received for ' + str(time_diff_mins) + 'min']
+        return Status.error, msg
+
+    status = Status.healthy
+
+    cp_status = _get_status(json_resp['status']['control_plane'])
+    if cp_status is not Status.healthy:
+        status = Status.unhealthy
+        msg += ["control plane is not healthy"]
+
+    up_status = _get_status(json_resp['status']['user_plane'])
+    if up_status is not Status.healthy:
+        status = Status.unhealthy
+        msg += ["user plane is not healthy"]
+
+    return status, msg
+
+
+def get_k8s_status(cluster):
+    status = ""
+    msg = []
+
+    status = _get_status(cluster.state)
+    if cluster.state == "unavailable" or status is Status.unhealthy:
+        msg += [cluster.transitioningMessage]
+        return Status.error, msg
+
+    for component in cluster.componentStatuses:
+        for condition in component.conditions:
+            if condition.status != "True":
+                msg += [component.name + " is unhealthy"]
+    for node in cluster.nodes():
+        if _get_status(node.state) != Status.healthy:
+            msg += [node.hostname + " is unhealthy"]
+
+    return status, msg
+
+
+def get_cluster_health_report(cluster, edge=False):
+    status = ""
+    msg = []
+    report = ["*[" + cluster.name + "]*"]
+
+    # Check K8S API health
+    k8s_status, msg = get_k8s_status(cluster)
+    report += _get_report("Kubernetes", k8s_status, msg)
+
+    # Check managed project health
+    for project in CONF.managed_projects:
+        status = ""
+        msg = []
+        if k8s_status == Status.healthy:
+            status, msg = get_project_status(cluster, project)
+        else:
+            status = Status.error
+        report += _get_report(project.display, status, msg)
+
+    # Check Aether network health for Edges
+    # TODO: separate report for control plane and user plane
+    if edge:
+        status, msg = get_aether_network_status(cluster.name)
+        report += _get_report("Aether Network", status, msg)
+
+    report_string = "\n".join(report)
+    return report_string
+
+
+def main():
+    report = REPORT_HEADER
+
+    # Get cluster status from Rancher
+    try:
+        rancher_client = RancherClient(
+            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
+            secret_key=CONF.rancher.secret_key)
+        response = rancher_client.list_cluster()
+    except Exception as e:
+        report += _get_report("Rancher", Status.error, [str(e)])
+        response = namedtuple('X', "data")([])
+
+    # Check cluster health and make a report
+    for cluster in response.data:
+        if "production" in cluster.name:
+            edge = True if "edge" in cluster.name else False
+            report += get_cluster_health_report(cluster, edge)
+            report += "\n\n"
+
+    # Publish the report to Slack channel
+    try:
+        slack_client = SlackClient(token=CONF.slack.api_token)
+        response = slack_client.chat_postMessage(
+            channel=CONF.slack.channel,
+            text=report)
+    except SlackApiError as e:
+        assert e.response["ok"] is False
+        assert e.response["error"]
+        print(f"Got an error: {e.response['error']}")
+
+
+if __name__ == "__main__":
+    main()
commit	f32ae9a38efd4e0241fcfafdc356e42b041b4dfb	[log] [tgz]
author	Hyunsun Moon <hyunsun@opennetworking.org>	Thu May 28 13:17:45 2020 -0700
committer	Hyunsun Moon <hyunsun.moon@gmail.com>	Sun Jul 26 16:27:13 2020 -0700
tree	83bc8642b70f032b683a30935373aa84e1684b66
parent	8f4b18a98f960bac26be0427cc155fd8c34611bc [diff] [blame]