| #!/usr/bin/env python |
| |
| # Copyright 2020-present Open Networking Foundation |
| # |
| # SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0 |
| |
| import sys |
| import os |
| import json |
| import enum |
| import time |
| import datetime |
| import requests |
| from pytz import timezone |
| from datetime import date |
| from collections import namedtuple |
| from rancher import Client as RancherClient |
| from slack import WebClient as SlackClient |
| from slack.errors import SlackApiError |
| |
| |
class Status(enum.Enum):
    """Outcome of a single health check.

    Members are used as keys of the EMOJI table and as the status value
    returned by the get_*_status helpers in this module.
    """

    healthy = 1
    unhealthy = 2
    # The check itself failed to run to completion.
    error = 3
    # There is no automated check for the item.
    unavailable = 4
| |
| |
# Load the JSON configuration once at import time. The file path comes from
# the CONFIG_FILE environment variable and falls back to ./config.json.
# Every JSON object is turned into a namedtuple so that settings can be read
# with attribute access (e.g. CONF.slack.channel); this requires all object
# keys in the config to be valid Python identifiers.
# The file is opened in a `with` block so the handle is closed right after
# parsing, and decoded explicitly as UTF-8 instead of the locale default.
with open(os.getenv('CONFIG_FILE', "./config.json"),
          encoding="utf-8") as _config_file:
    CONF = json.load(
        _config_file,
        object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
    )
| |
# Slack emoji placed in front of each report entry, keyed by check outcome.
# CONF.slack.emoji is read once per Status member, using the member's name
# as the attribute name (healthy, unhealthy, error, unavailable).
EMOJI = {
    outcome: getattr(CONF.slack.emoji, outcome.name) for outcome in Status
}

# State strings that _get_status maps to Status.healthy; every other state
# string is reported as unhealthy.
HEALTHY_STATES = ["active", "connected"]
| |
# Horizontal rule printed above and below the report header.
_HEADER_RULE = (
    "*******************************************************************************\n"
)

# Header of every report: rule, configured title followed by today's date
# (evaluated once, when the module is imported), an emoji legend, rule.
REPORT_HEADER = "".join([
    _HEADER_RULE,
    CONF.slack.report_header,
    date.today().strftime(" %d/%m/%Y\n"),
    EMOJI[Status.healthy], " Healthy ",
    EMOJI[Status.unhealthy], " Unhealthy ",
    EMOJI[Status.error], " Checks failed ",
    EMOJI[Status.unavailable], " Checks not automated\n",
    _HEADER_RULE,
])

# Prefixes put in front of detail messages by _get_report: the ERROR prefix
# for Status.error entries, the BLANK prefix for every other status.
MSG_PREFIX_BLANK = " :warning-sign: "
MSG_PREFIX_ERROR = " [ERROR]: "
| |
| |
def _get_status(state_string):
    """Translate a state string into a Status member.

    Returns Status.healthy when state_string is listed in HEALTHY_STATES,
    and Status.unhealthy for any other value.
    """
    return Status.healthy if state_string in HEALTHY_STATES else Status.unhealthy
| |
| |
| def _get_report(name, status, msg=[]): |
| report = [] |
| if not status: |
| return [] |
| report += [EMOJI[status] + " " + name] |
| msg_prefix = MSG_PREFIX_ERROR if status == Status.error else MSG_PREFIX_BLANK |
| for m in msg: |
| report += [msg_prefix + m] |
| return report |
| |
| |
| def _run_adb_command(adb_path, ssh_client, command): |
| final_command = adb_path + " shell " + command |
| stdin, stdout, stderr = ssh_client.exec_command(final_command) |
| error = stderr.read().decode("ascii").strip("\n") |
| if "Error" in error: |
| msg = "failed to run command: " + final_command |
| return False, msg |
| output = stdout.read().decode("ascii").strip("\n") |
| time.sleep(2) |
| return True, output |
| |
| |
def get_project_status(cluster, project):
    """Report the health of one managed project in a cluster.

    Looks up the cluster's projects named project.name and inspects either
    their apps (when project.check_type is "app") or their workloads
    (any other check_type).

    Args:
        cluster: Object exposing projects(name=...); each returned project
            must expose apps() / workloads() whose result has a .data list
            of targets with .state and .name.
        project: Config entry with .name and .check_type.

    Returns:
        (status, msg) where status is Status.unhealthy if at least one
        target is unhealthy, Status.healthy if every inspected target is
        healthy, and "" when no target was found at all. msg holds one
        warning per unhealthy target.
    """
    check_type = project.check_type
    status = ""
    msg = []

    for p in cluster.projects(name=project.name):
        targets = p.apps() if check_type == "app" else p.workloads()
        for target in targets.data:
            if _get_status(target.state) == Status.unhealthy:
                # One unhealthy target makes the whole project unhealthy,
                # no matter in which order the targets are listed.
                status = Status.unhealthy
                msg.append(check_type + " " + target.name + " is unhealthy")
            elif not status:
                status = Status.healthy

    return status, msg
| |
| |
def get_aether_network_status(edge_name, timeout=10):
    """Query the edge monitoring API for the network status of one edge.

    Args:
        edge_name: Name appended to CONF.edge_monitoring.api_url to form
            the request URL.
        timeout: Seconds to wait for the monitoring API before giving up,
            so an unresponsive server cannot block the report forever.

    Returns:
        (status, msg) where msg is always a list of strings:
        - Status.unavailable, [] when the API answers 404 for this edge;
        - Status.error with a message when the request fails, the response
          cannot be interpreted, or the last status update is more than
          10 minutes old;
        - Status.unhealthy with one message per unhealthy plane;
        - Status.healthy, [] otherwise.
    """
    try:
        req_url = CONF.edge_monitoring.api_url + "/" + edge_name
        response = requests.get(req_url, timeout=timeout)

        if response.status_code == 404:
            return Status.unavailable, []
        response.raise_for_status()
    except Exception as e:
        # Wrapped in a list: _get_report emits one line per list element.
        return Status.error, [str(e)]

    try:
        json_resp = json.loads(response.text)['edge']

        last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
        time_diff = datetime.datetime.now() - last_update
        time_diff_mins = int(round(time_diff.total_seconds() / 60))
        if time_diff_mins > 10:
            return Status.error, [
                'status report not received for ' + str(time_diff_mins) + 'min'
            ]

        plane_states = json_resp['status']
        control_plane = plane_states['control_plane']
        user_plane = plane_states['user_plane']
    except (KeyError, TypeError, ValueError) as e:
        # Invalid JSON or a missing/ill-typed field: report a failed check
        # instead of aborting the whole report.
        return Status.error, ["malformed edge status response: " + repr(e)]

    status = Status.healthy
    msg = []

    if _get_status(control_plane) is not Status.healthy:
        status = Status.unhealthy
        msg.append("control plane is not healthy")

    if _get_status(user_plane) is not Status.healthy:
        status = Status.unhealthy
        msg.append("user plane is not healthy")

    return status, msg
| |
| |
def get_k8s_status(cluster):
    """Report the Kubernetes health of a cluster.

    Returns:
        (Status.error, [cluster.transitioningMessage]) when the cluster
        state is "unavailable" or maps to Status.unhealthy. Otherwise
        (status, msg), where status is the Status derived from
        cluster.state and msg lists every component with a condition whose
        status is not "True" and every node whose state is not healthy.
        These warnings do not change the returned status.
    """
    status = _get_status(cluster.state)
    if cluster.state == "unavailable" or status is Status.unhealthy:
        return Status.error, [cluster.transitioningMessage]

    # One warning per failing condition, so a component may appear twice.
    msg = [
        component.name + " is unhealthy"
        for component in cluster.componentStatuses
        for condition in component.conditions
        if condition.status != "True"
    ]
    msg.extend(
        node.hostname + " is unhealthy"
        for node in cluster.nodes()
        if _get_status(node.state) != Status.healthy
    )
    return status, msg
| |
| |
def get_cluster_health_report(cluster, edge=False):
    """Assemble the health report text for one cluster.

    The report starts with the cluster name, followed by the Kubernetes
    check, one entry per project in CONF.managed_projects and, when edge is
    true, the Aether network check.

    Returns:
        The report lines joined with newlines, as a single string.
    """
    lines = ["*[" + cluster.name + "]*"]

    # Check K8S API health
    k8s_status, k8s_msg = get_k8s_status(cluster)
    lines.extend(_get_report("Kubernetes", k8s_status, k8s_msg))

    # Check managed project health. Projects are only inspected when the
    # Kubernetes check came back healthy; otherwise each one is reported
    # as a failed check with no details.
    k8s_ok = k8s_status == Status.healthy
    for project in CONF.managed_projects:
        if k8s_ok:
            project_status, project_msg = get_project_status(cluster, project)
        else:
            project_status, project_msg = Status.error, []
        lines.extend(_get_report(project.display, project_status, project_msg))

    # Check Aether network health for Edges
    # TODO: separate report for control plane and user plane
    if edge:
        net_status, net_msg = get_aether_network_status(cluster.name)
        lines.extend(_get_report("Aether Network", net_status, net_msg))

    return "\n".join(lines)
| |
| |
def main():
    """Build the health report for all production clusters and post it.

    Clusters are listed through the Rancher API; every cluster whose name
    contains "production" gets a section in the report, and clusters whose
    name also contains "edge" additionally get the Aether network check.
    The finished report is posted to the Slack channel configured in CONF.
    A Rancher failure is reported inside the message itself; a Slack API
    failure is printed to standard output.
    """
    report = REPORT_HEADER

    # Get cluster status from Rancher
    try:
        rancher_client = RancherClient(
            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
            secret_key=CONF.rancher.secret_key)
        clusters = rancher_client.list_cluster().data
    except Exception as e:
        # _get_report returns a list of lines, while the report is a string:
        # join the lines before appending them.
        report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
        report += "\n\n"
        clusters = []

    # Check cluster health and make a report
    for cluster in clusters:
        if "production" in cluster.name:
            report += get_cluster_health_report(cluster, "edge" in cluster.name)
            report += "\n\n"

    # Publish the report to Slack channel
    try:
        slack_client = SlackClient(token=CONF.slack.api_token)
        slack_client.chat_postMessage(
            channel=CONF.slack.channel,
            text=report)
    except SlackApiError as e:
        print(f"Got an error: {e.response['error']}")


if __name__ == "__main__":
    main()