blob: 4cd3aa2726b9550d75fdd3cfec3e2423f20ce1b9 [file] [log] [blame]
Hyunsun Moonf32ae9a2020-05-28 13:17:45 -07001#!/usr/bin/env python
2
3# Copyright 2020-present Open Networking Foundation
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17import sys
18import os
19import json
20import enum
21import time
22import datetime
23import requests
24from pytz import timezone
25from datetime import date
26from collections import namedtuple
27from rancher import Client as RancherClient
28from slack import WebClient as SlackClient
29from slack.errors import SlackApiError
30
31
class Status(enum.Enum):
    """Outcome of a single health check."""

    healthy = 1
    unhealthy = 2
    error = 3  # the check itself failed
    unavailable = 4  # the check is not automated
37
38
# Load the JSON configuration named by the CONFIG_FILE environment variable
# (default: ./config.json). Every JSON object is converted to a namedtuple so
# settings can be read with attribute access, e.g. CONF.slack.channel.
# The file is opened in a `with` block so the handle is closed right away.
with open(os.getenv('CONFIG_FILE', "./config.json")) as _config_file:
    CONF = json.load(
        _config_file,
        object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
    )
43
# Slack emoji for each check outcome, read from the CONF.slack.emoji setting
# that has the same name as the Status member.
EMOJI = {
    status: getattr(CONF.slack.emoji, status.name) for status in Status
}

# Raw state strings that _get_status() treats as healthy.
HEALTHY_STATES = ["active", "connected"]
52
# Horizontal rule printed above and below the report header.
_HEADER_RULE = '''\
*******************************************************************************
'''

# Header placed at the top of the report: configured title, today's date and
# the emoji legend, framed by two rules.
# NOTE: the date is evaluated once, when this module is loaded.
REPORT_HEADER = "".join([
    _HEADER_RULE,
    CONF.slack.report_header,
    date.today().strftime(" %d/%m/%Y\n"),
    EMOJI[Status.healthy] + " Healthy ",
    EMOJI[Status.unhealthy] + " Unhealthy ",
    EMOJI[Status.error] + " Checks failed ",
    EMOJI[Status.unavailable] + " Checks not automated\n",
    _HEADER_RULE,
])

# Prefixes for the detail lines printed under a check's headline:
# MSG_PREFIX_ERROR when the check failed, MSG_PREFIX_BLANK otherwise.
MSG_PREFIX_BLANK = " :warning-sign: "
MSG_PREFIX_ERROR = " [ERROR]: "
68
69
def _get_status(state_string):
    """Translate a raw state string into a Status.

    Returns Status.healthy when state_string is listed in HEALTHY_STATES and
    Status.unhealthy for anything else.
    """
    return Status.healthy if state_string in HEALTHY_STATES else Status.unhealthy
75
76
77def _get_report(name, status, msg=[]):
78 report = []
79 if not status:
80 return []
81 report += [EMOJI[status] + " " + name]
82 msg_prefix = MSG_PREFIX_ERROR if status == Status.error else MSG_PREFIX_BLANK
83 for m in msg:
84 report += [msg_prefix + m]
85 return report
86
87
88def _run_adb_command(adb_path, ssh_client, command):
89 final_command = adb_path + " shell " + command
90 stdin, stdout, stderr = ssh_client.exec_command(final_command)
91 error = stderr.read().decode("ascii").strip("\n")
92 if "Error" in error:
93 msg = "failed to run command: " + final_command
94 return False, msg
95 output = stdout.read().decode("ascii").strip("\n")
96 time.sleep(2)
97 return True, output
98
99
def get_project_status(cluster, project):
    """Aggregate the health of every app or workload in a managed project.

    Args:
        cluster: object exposing projects(name=...), which yields the
            projects matching project.name.
        project: config entry with `name` and `check_type`; check_type "app"
            inspects p.apps(), any other value inspects p.workloads().

    Returns:
        (status, msg). status is "" when no target was found (nothing to
        report), Status.unhealthy when at least one target is unhealthy, and
        Status.healthy otherwise. msg holds one warning per unhealthy target.
    """
    check_type = project.check_type
    status = ""
    msg = []

    for p in cluster.projects(name=project.name):
        targets = p.apps() if check_type == "app" else p.workloads()
        for target in targets.data:
            if _get_status(target.state) == Status.unhealthy:
                # Once unhealthy, stay unhealthy: a later healthy target must
                # not hide this one or drop its warning.
                status = Status.unhealthy
                msg.append(check_type + " " + target.name + " is unhealthy")
            elif not status:
                status = Status.healthy

    return status, msg
118
119
def get_aether_network_status(edge_name):
    """Check the network status that the edge monitoring API reports for an edge.

    Sends a GET request to CONF.edge_monitoring.api_url + "/" + edge_name.

    Args:
        edge_name: name of the edge, appended to the API URL.

    Returns:
        (status, msg), where msg is always a list of strings:
          * (Status.unavailable, []) when the API answers 404;
          * (Status.error, [reason]) when the request fails, the response
            cannot be interpreted, or the last status update is more than
            10 minutes old;
          * (Status.unhealthy, [...]) when the control plane and/or the user
            plane is not in a healthy state;
          * (Status.healthy, []) otherwise.
    """
    req_url = CONF.edge_monitoring.api_url + "/" + edge_name
    try:
        # Bound the request so an unresponsive API cannot hang the report.
        response = requests.get(req_url, timeout=10)

        if response.status_code == 404:
            return Status.unavailable, []
        response.raise_for_status()
    except Exception as e:
        # Callers iterate over msg, so it must be a list, not a bare string.
        return Status.error, [str(e)]

    try:
        json_resp = json.loads(response.text)['edge']

        last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
        time_diff = datetime.datetime.now() - last_update
        time_diff_mins = int(round(time_diff.total_seconds() / 60))
        if time_diff_mins > 10:
            return Status.error, [
                'status report not received for ' + str(time_diff_mins) + 'min']

        cp_state = json_resp['status']['control_plane']
        up_state = json_resp['status']['user_plane']
    except (ValueError, KeyError, TypeError, OverflowError, OSError) as e:
        # A malformed payload is reported as a failed check for this edge
        # instead of aborting the report for every cluster.
        return Status.error, ["unexpected response from " + req_url + ": " + repr(e)]

    msg = []
    if _get_status(cp_state) is not Status.healthy:
        msg.append("control plane is not healthy")
    if _get_status(up_state) is not Status.healthy:
        msg.append("user plane is not healthy")

    status = Status.unhealthy if msg else Status.healthy
    return status, msg
156
157
def get_k8s_status(cluster):
    """Check the state of a cluster, its components and its nodes.

    Returns:
        (Status.error, [cluster.transitioningMessage]) when the cluster state
        is "unavailable" or not a healthy state. Otherwise (status, msg),
        where status comes from the cluster state and msg lists every
        component condition whose status is not "True" and every node whose
        state is not healthy.
    """
    status = _get_status(cluster.state)
    if cluster.state == "unavailable" or status is Status.unhealthy:
        return Status.error, [cluster.transitioningMessage]

    # One warning per failing condition, so a component may appear repeatedly.
    component_warnings = [
        component.name + " is unhealthy"
        for component in cluster.componentStatuses
        for condition in component.conditions
        if condition.status != "True"
    ]
    node_warnings = [
        node.hostname + " is unhealthy"
        for node in cluster.nodes()
        if _get_status(node.state) != Status.healthy
    ]
    return status, component_warnings + node_warnings
176
177
def get_cluster_health_report(cluster, edge=False):
    """Build the report section for one cluster.

    Args:
        cluster: cluster object passed on to the individual checks.
        edge: when true, the Aether network check is included as well.

    Returns:
        A newline-joined string: the "*[<cluster name>]*" title followed by
        the lines of every check.
    """
    lines = ["*[" + cluster.name + "]*"]

    # Check K8S API health
    k8s_status, k8s_msg = get_k8s_status(cluster)
    lines.extend(_get_report("Kubernetes", k8s_status, k8s_msg))

    # Check managed project health; projects are only inspected when
    # Kubernetes itself is healthy, otherwise each is marked as failed.
    k8s_healthy = k8s_status == Status.healthy
    for project in CONF.managed_projects:
        if k8s_healthy:
            project_status, project_msg = get_project_status(cluster, project)
        else:
            project_status, project_msg = Status.error, []
        lines.extend(_get_report(project.display, project_status, project_msg))

    # Check Aether network health for Edges
    # TODO: separate report for control plane and user plane
    if edge:
        net_status, net_msg = get_aether_network_status(cluster.name)
        lines.extend(_get_report("Aether Network", net_status, net_msg))

    return "\n".join(lines)
205
206
def main():
    """Build the health report for production clusters and post it to Slack.

    Clusters are listed through the Rancher API. Only clusters whose name
    contains "production" are reported, and those whose name also contains
    "edge" get the extra Aether network check. A Rancher failure is written
    into the report itself; a Slack API failure is printed.
    """
    report = REPORT_HEADER

    # Get cluster status from Rancher
    try:
        rancher_client = RancherClient(
            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
            secret_key=CONF.rancher.secret_key)
        response = rancher_client.list_cluster()
        clusters = response.data
    except Exception as e:
        # _get_report() returns a list of lines while the report is a string,
        # so the lines are joined before being appended.
        report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
        report += "\n\n"
        clusters = []

    # Check cluster health and make a report
    for cluster in clusters:
        if "production" in cluster.name:
            report += get_cluster_health_report(cluster, "edge" in cluster.name)
            report += "\n\n"

    # Publish the report to Slack channel
    try:
        slack_client = SlackClient(token=CONF.slack.api_token)
        slack_client.chat_postMessage(
            channel=CONF.slack.channel,
            text=report)
    except SlackApiError as e:
        # No asserts here: they are stripped under -O and, if they ever
        # failed, would replace the real error with an AssertionError.
        print(f"Got an error: {e.response['error']}")


if __name__ == "__main__":
    main()