#!/usr/bin/env python

# Copyright 2020-present Open Networking Foundation
#
# SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0

7import sys
8import os
9import json
10import enum
11import time
12import datetime
13import requests
14from pytz import timezone
15from datetime import date
16from collections import namedtuple
17from rancher import Client as RancherClient
18from slack import WebClient as SlackClient
19from slack.errors import SlackApiError
20
21
class Status(enum.Enum):
    """Result of one health check."""

    healthy = 1
    unhealthy = 2
    # The check itself failed.
    error = 3
    # The check is not automated.
    unavailable = 4
27
28
def _load_config(path):
    """Parse the JSON file at ``path`` into nested namedtuples.

    Every JSON object is converted to a namedtuple so that settings are
    read as attributes (for example ``CONF.slack.channel``). Object keys
    must therefore be valid Python identifiers.
    """
    # The context manager closes the file handle as soon as parsing is done.
    with open(path) as config_file:
        return json.load(
            config_file,
            object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
        )


# Settings are read once at import time from the file named by the
# CONFIG_FILE environment variable, falling back to ./config.json.
CONF = _load_config(os.getenv('CONFIG_FILE', "./config.json"))
33
# Slack emoji printed in front of each report item, keyed by check status.
# Each Status member reads the config entry of the same name under
# slack.emoji (healthy, unhealthy, error, unavailable).
EMOJI = {status: getattr(CONF.slack.emoji, status.name) for status in Status}
40
# State strings that count as healthy; every other state is unhealthy.
HEALTHY_STATES = [
    "active",
    "connected",
]
42
# Banner placed at the top of every report: a rule, the configured title
# followed by today's date, an emoji legend, and a closing rule.
REPORT_HEADER = "".join([
    "*******************************************************************************\n",
    CONF.slack.report_header,
    date.today().strftime(" %d/%m/%Y\n"),
    EMOJI[Status.healthy], " Healthy ",
    EMOJI[Status.unhealthy], " Unhealthy ",
    EMOJI[Status.error], " Checks failed ",
    EMOJI[Status.unavailable], " Checks not automated\n",
    "*******************************************************************************\n",
])
55
# Prefixes for the detail lines printed under a report item:
# ERROR for items whose check failed, BLANK (a warning sign) for all others.
MSG_PREFIX_ERROR = " [ERROR]: "
MSG_PREFIX_BLANK = " :warning-sign: "
58
59
def _get_status(state_string):
    """Translate a raw state string into a ``Status``.

    Returns ``Status.healthy`` when ``state_string`` is listed in
    ``HEALTHY_STATES`` and ``Status.unhealthy`` for anything else.
    """
    is_healthy = state_string in HEALTHY_STATES
    return Status.healthy if is_healthy else Status.unhealthy
65
66
def _get_report(name, status, msg=None):
    """Format one checked item as a list of report lines.

    Args:
        name: Display name of the item.
        status: ``Status`` of the item. A falsy value (``""`` or ``None``)
            produces an empty list, i.e. the item is left out of the report.
        msg: Detail messages: a list of strings, a single string, or
            ``None`` for no details.

    Returns:
        A list whose first entry is the status emoji followed by ``name``,
        then one prefixed line per message.
    """
    if not status:
        return []

    if msg is None:
        msg = []
    elif isinstance(msg, str):
        # A bare string would otherwise be iterated character by character,
        # producing one report line per letter.
        msg = [msg]

    msg_prefix = MSG_PREFIX_ERROR if status == Status.error else MSG_PREFIX_BLANK
    return [EMOJI[status] + " " + name] + [msg_prefix + m for m in msg]
76
77
def _run_adb_command(adb_path, ssh_client, command):
    """Run ``<adb_path> shell <command>`` through ``ssh_client``.

    Args:
        adb_path: Leading part of the command line (the adb executable).
        ssh_client: Object whose ``exec_command(cmd)`` returns a
            ``(stdin, stdout, stderr)`` triple of readable byte streams.
        command: Text placed after ``shell`` on the command line.

    Returns:
        ``(False, message)`` when the stderr output contains ``"Error"``,
        otherwise ``(True, output)`` where ``output`` is the ASCII-decoded
        stdout with leading and trailing newlines removed.
    """
    full_command = " ".join((adb_path, "shell", command))
    _, out_stream, err_stream = ssh_client.exec_command(full_command)

    stderr_text = err_stream.read().decode("ascii").strip("\n")
    if "Error" in stderr_text:
        return False, "failed to run command: " + full_command

    stdout_text = out_stream.read().decode("ascii").strip("\n")
    # NOTE(review): the 2s pause after a successful command presumably paces
    # consecutive adb calls - confirm before removing.
    time.sleep(2)
    return True, stdout_text
88
89
def get_project_status(cluster, project):
    """Summarize the health of one managed project in a cluster.

    Args:
        cluster: Cluster object; ``cluster.projects(name=...)`` is used to
            look up the projects matching ``project.name``.
        project: Config entry providing ``name`` and ``check_type``. A
            ``check_type`` of ``"app"`` inspects each project's apps; any
            other value inspects its workloads.

    Returns:
        A ``(status, msg)`` tuple. ``status`` is ``Status.unhealthy`` if at
        least one target is unhealthy, ``Status.healthy`` if every target is
        healthy, and ``""`` when no target was found at all. ``msg`` holds
        one warning per unhealthy target.
    """
    check_type = project.check_type
    status = ""
    unhealthy = []

    for p in cluster.projects(name=project.name):
        targets = p.apps() if check_type == "app" else p.workloads()
        for target in targets.data:
            if _get_status(target.state) == Status.unhealthy:
                # One unhealthy target makes the whole project unhealthy; a
                # healthy target seen later must not reset that verdict.
                status = Status.unhealthy
                unhealthy.append(
                    check_type + " " + target.name + " is unhealthy")
            elif status != Status.unhealthy:
                status = Status.healthy

    return status, unhealthy
108
109
def get_aether_network_status(edge_name):
    """Report the Aether network health of one edge.

    Fetches ``CONF.edge_monitoring.api_url + "/" + edge_name`` and inspects
    the ``edge`` object of the JSON response.

    Args:
        edge_name: Edge identifier appended to the monitoring API URL.

    Returns:
        A ``(status, msg)`` tuple in which ``msg`` is always a list of
        strings:

        * ``Status.unavailable`` - the API answered 404 for this edge.
        * ``Status.error`` - the request failed, the response could not be
          interpreted, or the edge's last update is more than 10 minutes old.
        * ``Status.unhealthy`` - control plane and/or user plane is not in a
          healthy state.
        * ``Status.healthy`` - otherwise.
    """
    # Without a timeout a stalled monitoring server would block the whole
    # report indefinitely.
    request_timeout_secs = 30

    try:
        req_url = CONF.edge_monitoring.api_url + "/" + edge_name
        response = requests.get(req_url, timeout=request_timeout_secs)

        if response.status_code == 404:
            return Status.unavailable, []
        response.raise_for_status()
    except Exception as e:
        # The message must be wrapped in a list: callers iterate over it and
        # a bare string would be split into single characters.
        return Status.error, [str(e)]

    msg = []
    try:
        json_resp = json.loads(response.text)['edge']

        last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
        time_diff = datetime.datetime.now() - last_update
        time_diff_mins = int(round(time_diff.total_seconds() / 60))
        if time_diff_mins > 10:
            msg += ['status report not received for ' + str(time_diff_mins) + 'min']
            return Status.error, msg

        cp_status = _get_status(json_resp['status']['control_plane'])
        up_status = _get_status(json_resp['status']['user_plane'])
    except (ValueError, KeyError, TypeError, OverflowError, OSError) as e:
        # A malformed payload is reported as a failed check instead of
        # aborting the report for every cluster.
        return Status.error, ["unexpected edge monitoring response: " + repr(e)]

    status = Status.healthy

    if cp_status is not Status.healthy:
        status = Status.unhealthy
        msg += ["control plane is not healthy"]

    if up_status is not Status.healthy:
        status = Status.unhealthy
        msg += ["user plane is not healthy"]

    return status, msg
146
147
def get_k8s_status(cluster):
    """Check the Kubernetes health of a cluster.

    Returns:
        ``(Status.error, [cluster.transitioningMessage])`` when the cluster
        state is ``"unavailable"`` or maps to ``Status.unhealthy``.
        Otherwise ``(status, msg)`` where ``status`` is the status derived
        from ``cluster.state`` and ``msg`` lists every component with a
        condition whose status is not ``"True"``, followed by every node
        whose state is not healthy.
    """
    status = _get_status(cluster.state)
    if cluster.state == "unavailable" or status is Status.unhealthy:
        return Status.error, [cluster.transitioningMessage]

    # One entry per failing condition, so a component may appear repeatedly.
    component_warnings = [
        component.name + " is unhealthy"
        for component in cluster.componentStatuses
        for condition in component.conditions
        if condition.status != "True"
    ]
    node_warnings = [
        node.hostname + " is unhealthy"
        for node in cluster.nodes()
        if _get_status(node.state) != Status.healthy
    ]
    return status, component_warnings + node_warnings
166
167
def get_cluster_health_report(cluster, edge=False):
    """Build the report text for a single cluster.

    Args:
        cluster: Cluster object to inspect.
        edge: When true, the Aether network check is appended as well.

    Returns:
        The report lines joined with newlines: a ``*[name]*`` title, the
        Kubernetes check, one entry per project in
        ``CONF.managed_projects`` and, for edges, the Aether network check.
    """
    lines = ["*[" + cluster.name + "]*"]

    # Kubernetes API health
    k8s_status, k8s_msg = get_k8s_status(cluster)
    lines.extend(_get_report("Kubernetes", k8s_status, k8s_msg))

    # Managed projects are only inspected on a healthy cluster; otherwise
    # each one is reported as a failed check without details.
    k8s_healthy = k8s_status == Status.healthy
    for project in CONF.managed_projects:
        if k8s_healthy:
            project_status, project_msg = get_project_status(cluster, project)
        else:
            project_status, project_msg = Status.error, []
        lines.extend(_get_report(project.display, project_status, project_msg))

    # Aether network health, edges only
    # TODO: separate report for control plane and user plane
    if edge:
        net_status, net_msg = get_aether_network_status(cluster.name)
        lines.extend(_get_report("Aether Network", net_status, net_msg))

    return "\n".join(lines)
195
196
def main():
    """Build the health report for production clusters and post it to Slack.

    Clusters are listed through the Rancher API. Only clusters whose name
    contains ``"production"`` are reported, and those whose name also
    contains ``"edge"`` get the additional Aether network check. A failure
    to reach Rancher is included in the report rather than raised. A Slack
    API error is printed.
    """
    report = REPORT_HEADER

    # Get cluster status from Rancher
    try:
        rancher_client = RancherClient(
            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
            secret_key=CONF.rancher.secret_key)
        response = rancher_client.list_cluster()
    except Exception as e:
        # _get_report returns a list of lines while the report is a string,
        # so the lines are joined before being appended.
        report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
        report += "\n\n"
        clusters = []
    else:
        clusters = response.data

    # Check cluster health and make a report
    for cluster in clusters:
        if "production" in cluster.name:
            edge = "edge" in cluster.name
            report += get_cluster_health_report(cluster, edge)
            report += "\n\n"

    # Publish the report to Slack channel
    try:
        slack_client = SlackClient(token=CONF.slack.api_token)
        slack_client.chat_postMessage(
            channel=CONF.slack.channel,
            text=report)
    except SlackApiError as e:
        print(f"Got an error: {e.response['error']}")


if __name__ == "__main__":
    main()