AETHER-8 Add a script for daily status report to Slack channel
Change-Id: Ib5e7102d968961e1ca071eaf8a006cc67a4d5c9b
diff --git a/edge-monitoring/Dockerfile.server b/edge-monitoring/Dockerfile.server
new file mode 100644
index 0000000..3648f2b
--- /dev/null
+++ b/edge-monitoring/Dockerfile.server
@@ -0,0 +1,22 @@
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+FROM python:3.7-slim
+
+WORKDIR /usr/src/app
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+COPY edge_monitoring_server.py ./
+
+CMD ["python", "edge_monitoring_server.py"]
diff --git a/edge-monitoring/config.json b/edge-monitoring/config.json
new file mode 100644
index 0000000..8f8a051
--- /dev/null
+++ b/edge-monitoring/config.json
@@ -0,0 +1,11 @@
+{
+ "edge_name": "production-edge-onf-menlo",
+ "adb": {
+ "path": "/usr/local/bin/adb",
+ "apn_mode_toggle_location": {
+ "x": "550",
+ "y": "700"
+ }
+ },
+ "report_url": "https://aether.onlab.us/edges"
+}
diff --git a/edge-monitoring/edge_monitoring_agent.py b/edge-monitoring/edge_monitoring_agent.py
new file mode 100644
index 0000000..bcfd5b5
--- /dev/null
+++ b/edge-monitoring/edge_monitoring_agent.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import os
+import time
+import requests
+import json
+import enum
+import daemon
+from collections import namedtuple
+from pyadb import ADB
+
+'''
+Check Aether network operational status and report it to
+central monitoring server
+
+1) check mobile connectivity after toggling the airplane mode
+2) check if ping to 8.8.8.8 works
+'''
+
+# Parse the config once at import; JSON objects become namedtuples (CONF.a.b).
+with open(os.getenv('CONFIG_FILE', "./config.json")) as _conf_file:
+    CONF = json.load(_conf_file, object_hook=lambda d: namedtuple(
+        'X', d.keys())(*d.values()))
+
+ADB_GET_COMMANDS = {
+ "apn_mode": "settings get global airplane_mode_on",
+ "lte_state": "dumpsys telephony.registry | grep -m1 mDataConnectionState",
+ "ping_result": "ping -c 3 8.8.8.8&>/dev/null; echo $?"
+}
+ADB_APN_COMMANDS = {
+ "home": "input keyevent 3",
+ "setting": "am start -a android.settings.AIRPLANE_MODE_SETTINGS",
+ "toggle": "input tap " + \
+ CONF.adb.apn_mode_toggle_location.x + " " + \
+ CONF.adb.apn_mode_toggle_location.y
+}
+
+
+class State(enum.Enum):  # values are matched against mDataConnectionState
+ error = "-1"
+ disconnected = "0"
+ connecting = "1"
+ connected = "2"
+ # NOTE(review): _value2member_map_ is an Enum internal - confirm it is stable
+ @classmethod
+ def has_value(cls, value):
+ return value in cls._value2member_map_  # True if value maps to a member
+
+
+edge_status = {
+ 'name': CONF.edge_name,
+ 'status': {
+ 'control_plane': None,
+ 'user_plane': 'connected'
+ }
+}
+
+
+def _run_adb_shell(adb, command):  # returns (ok, output-or-error-message)
+ result = adb.shell_command(command)
+ if adb.lastFailed():
+ err = "[ERROR]: " + command + " failed"
+ return False, err
+ time.sleep(2)  # fixed 2 s pause after every successful command
+ result = result[0] if result is not None else None  # keep first item only
+ return True, result
+
+
+def get_control_plane_state():
+    '''
+    check the control plane by toggling airplane mode; returns (State, err)
+    '''
+    adb = ADB()
+    if adb.set_adb_path(CONF.adb.path) is False:
+        err = "[ERROR]: " + CONF.adb.path + " not found"
+        return State.error, err
+
+    # get the current airplane mode
+    success, result = _run_adb_shell(adb, ADB_GET_COMMANDS['apn_mode'])
+    if not success or result is None:
+        return State.error, result
+    apn_mode_on = result == "1"
+
+    # toggle once; if the mode was off, toggle again so that it ends up off
+    for command in ADB_APN_COMMANDS.values():
+        success, result = _run_adb_shell(adb, command)
+        if not success:
+            return State.error, result
+    if not apn_mode_on:
+        success, result = _run_adb_shell(adb, ADB_APN_COMMANDS['toggle'])
+        if not success:
+            return State.error, result
+
+    # additional wait for UE to fully attach
+    time.sleep(3)
+
+    # poll until the reported state is no longer "connecting"
+    state = State.connecting.value
+    while state == State.connecting.value:
+        success, result = _run_adb_shell(adb, ADB_GET_COMMANDS['lte_state'])
+        if not success or result is None:
+            return State.error, result
+        # tolerate padding around the value; "" (-> error) if "=" is missing
+        state = result.partition("=")[2].strip()
+    if not State.has_value(state):
+        return State.error, None
+    return State(state), None
+
+
+def get_user_plane_state():
+    '''
+    report user plane connectivity based on a ping to 8.8.8.8 run via adb
+    '''
+    device = ADB()
+    if device.set_adb_path(CONF.adb.path) is False:
+        return State.error, "[ERROR]: " + CONF.adb.path + " not found"
+
+    # the command echoes ping's exit status, so "0" means the ping succeeded
+    ok, exit_code = _run_adb_shell(device, ADB_GET_COMMANDS['ping_result'])
+    if exit_code is None or not ok:
+        return State.error, exit_code
+    if exit_code == "0":
+        return State.connected, None
+    return State.disconnected, None
+
+
+def report_aether_network_state():
+    '''
+    report the aether network state to the monitoring server
+
+    raises requests.HTTPError when the server answers with 4xx/5xx
+    '''
+    response = requests.post(CONF.report_url, json=edge_status)
+    # any 2xx counts as success (the server answers 201, not 200)
+    response.raise_for_status()
+    print("[INFO]: reported the status")
+
+
+def run():
+    '''check both planes and report the result every 10 minutes, forever'''
+    while True:
+        # only the state names are reported; the error details are dropped
+        control_plane, _ = get_control_plane_state()
+        user_plane, _ = get_user_plane_state()
+        edge_status['status']['control_plane'] = control_plane.name
+        edge_status['status']['user_plane'] = user_plane.name
+        report_aether_network_state()
+        time.sleep(600)
+
+
+def main():
+ with daemon.DaemonContext():  # run the monitoring loop as a daemon process
+ run()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/edge-monitoring/edge_monitoring_server.py b/edge-monitoring/edge_monitoring_server.py
new file mode 100755
index 0000000..ed2088c
--- /dev/null
+++ b/edge-monitoring/edge_monitoring_server.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from flask import Flask, jsonify, abort, request
+
+app = Flask(__name__)
+edges = [
+ {
+ 'name': 'production-edge-example',
+ 'status': {
+ 'control_plane': 'connected',
+ 'user_plane': 'connected'
+ },
+ 'last_update': time.time()
+ }
+]
+
+
+@app.route('/edges/healthz', methods=['GET'])
+def get_health():
+ return {'message': 'healthy'}  # static reply; does not inspect any edge
+
+
+@app.route('/edges', methods=['GET'])
+def get_edges():
+ return jsonify({'edges': edges})  # every edge held in the in-memory list
+
+
+@app.route('/edges/<string:name>', methods=['GET'])
+def get_edge(name):  # first edge registered under this name, else 404
+    match = next((e for e in edges if e['name'] == name), None)
+    if match is None:
+        abort(404)
+    return jsonify({'edge': match})
+
+
+@app.route('/edges', methods=['POST'])
+def create_or_update_edge():
+    '''add the posted edge, or refresh it if known; 400 on a malformed body'''
+    body = request.json
+    status = body.get('status') if isinstance(body, dict) else None
+    # answer 400 (not an unhandled KeyError/TypeError) when a field is missing
+    if not isinstance(status, dict) or not isinstance(body.get('name'), str):
+        abort(400)
+    if 'control_plane' not in status or 'user_plane' not in status:
+        abort(400)
+
+    req_edge = {
+        'name': body['name'],
+        'status': {
+            'control_plane': status['control_plane'],
+            'user_plane': status['user_plane']
+        },
+        'last_update': time.time()
+    }
+
+    edge = [edge for edge in edges if edge['name'] == req_edge['name']]
+    if len(edge) == 0:
+        print("new edge request " + req_edge['name'])
+        edges.append(req_edge)
+    else:
+        edge[0]['status'].update(req_edge['status'])
+        edge[0]['last_update'] = req_edge['last_update']
+    return jsonify({'edge': req_edge}), 201
+
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=80)  # debug=True would expose the debugger
diff --git a/edge-monitoring/edge_monitoring_server_k8s.yaml b/edge-monitoring/edge_monitoring_server_k8s.yaml
new file mode 100644
index 0000000..49a21e2
--- /dev/null
+++ b/edge-monitoring/edge_monitoring_server_k8s.yaml
@@ -0,0 +1,67 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: edge-monitoring
+spec:
+ finalizers:
+ - kubernetes
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: edge-monitoring-server
+ labels:
+ app: edge-monitoring-server
+ namespace: edge-monitoring
+spec:
+ selector:
+ matchLabels:
+ app: edge-monitoring-server
+ replicas: 1
+ strategy:
+ type: RollingUpdate
+ template:
+ metadata:
+ labels:
+ app: edge-monitoring-server
+ spec:
+ containers:
+ - name: server
+ image: docker.io/omecproject/edge-monitoring-server:0.1.0
+ imagePullPolicy: Always
+ command: ["python", "edge_monitoring_server.py"]
+ livenessProbe:
+ httpGet:
+ path: /edges/healthz
+ port: 80
+ initialDelaySeconds: 3
+ periodSeconds: 3
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: edge-monitoring-server
+ namespace: edge-monitoring
+spec:
+ selector:
+ app: edge-monitoring-server
+ ports:
+ - port: 80
+ targetPort: 80
+ protocol: TCP
+ name: server
+---
+apiVersion: extensions/v1beta1
+kind: Ingress
+metadata:
+ name: edge-monitoring-server
+ namespace: edge-monitoring
+spec:
+ rules:
+ - host: aether.onlab.us
+ http:
+ paths:
+ - backend:
+ serviceName: edge-monitoring-server
+ servicePort: 80
+ path: /edges
\ No newline at end of file
diff --git a/edge-monitoring/requirements.txt b/edge-monitoring/requirements.txt
new file mode 100644
index 0000000..0410a74
--- /dev/null
+++ b/edge-monitoring/requirements.txt
@@ -0,0 +1,4 @@
+flask
+requests
+git+https://github.com/sch3m4/pyadb@master#egg=pyadb
+python-daemon
diff --git a/slack-notifier/aether_status_notifier.py b/slack-notifier/aether_status_notifier.py
new file mode 100644
index 0000000..4cd3aa2
--- /dev/null
+++ b/slack-notifier/aether_status_notifier.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import json
+import enum
+import time
+import datetime
+import requests
+from pytz import timezone
+from datetime import date
+from collections import namedtuple
+from rancher import Client as RancherClient
+from slack import WebClient as SlackClient
+from slack.errors import SlackApiError
+
+
+class Status(enum.Enum):
+ healthy = 1
+ unhealthy = 2
+ error = 3 # check failed
+ unavailable = 4 # check not automated
+
+
+# Parse the config once at import; JSON objects become namedtuples (CONF.a.b).
+with open(os.getenv('CONFIG_FILE', "./config.json")) as _conf_file:
+    CONF = json.load(_conf_file, object_hook=lambda d: namedtuple(
+        'X', d.keys())(*d.values()))
+
+EMOJI = {
+ Status.healthy: CONF.slack.emoji.healthy,
+ Status.unhealthy: CONF.slack.emoji.unhealthy,
+ Status.error: CONF.slack.emoji.error,
+ Status.unavailable: CONF.slack.emoji.unavailable
+}
+
+HEALTHY_STATES = ["active", "connected"]
+
+REPORT_HEADER = '''\
+*******************************************************************************
+'''
+REPORT_HEADER += CONF.slack.report_header
+REPORT_HEADER += date.today().strftime(" %d/%m/%Y\n")
+REPORT_HEADER += EMOJI[Status.healthy] + " Healthy "
+REPORT_HEADER += EMOJI[Status.unhealthy] + " Unhealthy "
+REPORT_HEADER += EMOJI[Status.error] + " Checks failed "
+REPORT_HEADER += EMOJI[Status.unavailable] + " Checks not automated\n"
+REPORT_HEADER += '''\
+*******************************************************************************
+'''
+
+MSG_PREFIX_BLANK = " :warning-sign: "
+MSG_PREFIX_ERROR = " [ERROR]: "
+
+
+def _get_status(state_string):
+    '''map a raw state string onto Status.healthy or Status.unhealthy'''
+    if state_string not in HEALTHY_STATES:
+        return Status.unhealthy
+    return Status.healthy
+
+
+def _get_report(name, status, msg=()):
+    '''build report lines: an emoji title line plus one line per message'''
+    if not status:
+        return []  # nothing was checked, so nothing is reported
+    if isinstance(msg, str):
+        # a bare string would otherwise be emitted one character per line
+        msg = [msg]
+    msg_prefix = MSG_PREFIX_ERROR if status == Status.error else MSG_PREFIX_BLANK
+    return [EMOJI[status] + " " + name] + [msg_prefix + m for m in msg]
+
+
+def _run_adb_command(adb_path, ssh_client, command):  # -> (ok, output or msg)
+ final_command = adb_path + " shell " + command
+ stdin, stdout, stderr = ssh_client.exec_command(final_command)
+ error = stderr.read().decode("ascii").strip("\n")
+ if "Error" in error:  # failure is detected only by "Error" in stderr
+ msg = "failed to run command: " + final_command
+ return False, msg
+ output = stdout.read().decode("ascii").strip("\n")
+ time.sleep(2)  # fixed 2 s pause after every successful command
+ return True, output
+
+
+def get_project_status(cluster, project):
+    '''
+    check the apps (check_type "app") or workloads of a managed project
+    returns (status, msg): status is "" when nothing was found, otherwise
+    unhealthy if any target is unhealthy; msg has one warning per such target
+    '''
+    status = ""
+    msg = []
+    check_type = project.check_type
+    for p in cluster.projects(name=project.name):
+        targets = p.apps() if check_type == "app" else p.workloads()
+        for target in targets.data:
+            if _get_status(target.state) == Status.unhealthy:
+                status = Status.unhealthy
+                msg += [check_type + " " + target.name + " is unhealthy"]
+            elif not status:
+                status = Status.healthy
+    return status, msg
+
+
+def get_aether_network_status(edge_name):
+    '''return (status, msg list) from the edge's last report to the server'''
+    msg = []
+
+    try:
+        req_url = CONF.edge_monitoring.api_url + "/" + edge_name
+        response = requests.get(req_url)
+
+        if response.status_code == 404:
+            return Status.unavailable, []
+        response.raise_for_status()
+        json_resp = json.loads(response.text)['edge']
+    except Exception as e:
+        # msg must be a list: a bare string is reported one char per line
+        return Status.error, [str(e)]
+
+    last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
+    time_diff = datetime.datetime.now() - last_update
+    time_diff_mins = int(round(time_diff.total_seconds() / 60))
+    if time_diff_mins > 10:
+        msg += ['status report not received for ' + str(time_diff_mins) + 'min']
+        return Status.error, msg
+
+    status = Status.healthy
+
+    cp_status = _get_status(json_resp['status']['control_plane'])
+    if cp_status is not Status.healthy:
+        status = Status.unhealthy
+        msg += ["control plane is not healthy"]
+
+    up_status = _get_status(json_resp['status']['user_plane'])
+    if up_status is not Status.healthy:
+        status = Status.unhealthy
+        msg += ["user plane is not healthy"]
+
+    return status, msg
+
+
+def get_k8s_status(cluster):
+ status = ""
+ msg = []
+ # a cluster whose state is not in HEALTHY_STATES is reported as error
+ status = _get_status(cluster.state)
+ if cluster.state == "unavailable" or status is Status.unhealthy:
+ msg += [cluster.transitioningMessage]
+ return Status.error, msg
+ # collect a warning for each failing component condition and node
+ for component in cluster.componentStatuses:
+ for condition in component.conditions:
+ if condition.status != "True":
+ msg += [component.name + " is unhealthy"]
+ for node in cluster.nodes():
+ if _get_status(node.state) != Status.healthy:
+ msg += [node.hostname + " is unhealthy"]
+ # NOTE(review): status is still healthy here even if warnings were added
+ return status, msg
+
+
+def get_cluster_health_report(cluster, edge=False):  # returns the report text
+ status = ""
+ msg = []
+ report = ["*[" + cluster.name + "]*"]
+
+ # Check K8S API health
+ k8s_status, msg = get_k8s_status(cluster)
+ report += _get_report("Kubernetes", k8s_status, msg)
+
+ # Check managed project health
+ for project in CONF.managed_projects:
+ status = ""
+ msg = []
+ if k8s_status == Status.healthy:
+ status, msg = get_project_status(cluster, project)
+ else:
+ status = Status.error  # projects are not queried when k8s is not healthy
+ report += _get_report(project.display, status, msg)
+
+ # Check Aether network health for Edges
+ # TODO: separate report for control plane and user plane
+ if edge:
+ status, msg = get_aether_network_status(cluster.name)
+ report += _get_report("Aether Network", status, msg)
+
+ report_string = "\n".join(report)  # one report entry per line
+ return report_string
+
+
+def main():
+    '''build the health report for production clusters and post it to Slack'''
+    report = REPORT_HEADER
+
+    # Get cluster status from Rancher
+    try:
+        rancher_client = RancherClient(
+            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
+            secret_key=CONF.rancher.secret_key)
+        response = rancher_client.list_cluster()
+    except Exception as e:
+        # report is a string, so the report lines have to be joined first
+        report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
+        report += "\n\n"
+        response = namedtuple('X', "data")([])
+
+    # Check cluster health and make a report
+    for cluster in response.data:
+        if "production" in cluster.name:
+            edge = "edge" in cluster.name
+            report += get_cluster_health_report(cluster, edge)
+            report += "\n\n"
+
+    # Publish the report to Slack channel
+    try:
+        slack_client = SlackClient(token=CONF.slack.api_token)
+        slack_client.chat_postMessage(
+            channel=CONF.slack.channel, text=report)
+    except SlackApiError as e:
+        print(f"Got an error: {e.response['error']}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/slack-notifier/config.json b/slack-notifier/config.json
new file mode 100644
index 0000000..c19c338
--- /dev/null
+++ b/slack-notifier/config.json
@@ -0,0 +1,38 @@
+{
+ "rancher": {
+ "api_url": "https://aether.onlab.us/v3",
+ "access_key": "<rancher-access-key>",
+ "secret_key": "<rancher-secret-key>"
+ },
+ "edge_monitoring": {
+ "api_url": "https://aether.onlab.us/edges"
+ },
+ "slack": {
+ "api_token": "<slack-bot-token>",
+ "channel": "#aether-status",
+ "emoji": {
+ "healthy": ":green_circle:",
+ "unhealthy": ":red_circle:",
+ "error": ":yellow_circle:",
+ "unavailable": ":white_circle:"
+ },
+ "report_header": "Aether Daily Status Report"
+ },
+ "managed_projects": [
+ {
+ "name": "System",
+ "check_type": "pod",
+ "display": "Kubernetes System Pods"
+ },
+ {
+ "name": "ConnectivityService",
+ "check_type": "app",
+ "display": "Connectivity Service Apps"
+ },
+ {
+ "name": "CordPlatform",
+ "check_type": "app",
+ "display": "Monitoring Service Apps"
+ }
+ ]
+}
diff --git a/slack-notifier/requirements.txt b/slack-notifier/requirements.txt
new file mode 100644
index 0000000..b00a8bf
--- /dev/null
+++ b/slack-notifier/requirements.txt
@@ -0,0 +1,4 @@
+slackclient
+requests
+git+https://github.com/rancher/client-python@master#egg=client-python
+pytz