AETHER-8 Add a script for daily status report to Slack channel

Change-Id: Ib5e7102d968961e1ca071eaf8a006cc67a4d5c9b
diff --git a/edge-monitoring/Dockerfile.server b/edge-monitoring/Dockerfile.server
new file mode 100644
index 0000000..3648f2b
--- /dev/null
+++ b/edge-monitoring/Dockerfile.server
@@ -0,0 +1,22 @@
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+FROM python:3.7-slim
+# NOTE(review): requirements.txt has a git+ URL; confirm this image ships git
+WORKDIR /usr/src/app
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+COPY edge_monitoring_server.py ./
+# Same command as the "command:" entry in edge_monitoring_server_k8s.yaml
+CMD ["python", "edge_monitoring_server.py"]
diff --git a/edge-monitoring/config.json b/edge-monitoring/config.json
new file mode 100644
index 0000000..8f8a051
--- /dev/null
+++ b/edge-monitoring/config.json
@@ -0,0 +1,11 @@
+{
+    "edge_name": "production-edge-onf-menlo",
+    "adb": {
+        "path": "/usr/local/bin/adb",
+        "apn_mode_toggle_location": {
+            "x": "550",
+            "y": "700"
+        }
+    },
+    "report_url": "https://aether.onlab.us/edges"
+}
diff --git a/edge-monitoring/edge_monitoring_agent.py b/edge-monitoring/edge_monitoring_agent.py
new file mode 100644
index 0000000..bcfd5b5
--- /dev/null
+++ b/edge-monitoring/edge_monitoring_agent.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import os
+import time
+import requests
+import json
+import enum
+import daemon
+from collections import namedtuple
+from pyadb import ADB
+
+'''
+Check Aether network operational status and report it to
+central monitoring server
+
+1) check mobile connectivity after toggling the airplane mode
+2) check if ping to 8.8.8.8 works
+'''
+
+# Load settings; nested JSON objects become namedtuples (attribute access).
+with open(os.getenv('CONFIG_FILE', "./config.json")) as _conf_file:
+    CONF = json.load(_conf_file, object_hook=lambda d:
+                     namedtuple('X', d.keys())(*d.values()))
+
+ADB_GET_COMMANDS = {  # read-only queries sent to the device through adb shell
+    "apn_mode": "settings get global airplane_mode_on",
+    "lte_state": "dumpsys telephony.registry | grep -m1 mDataConnectionState",
+    "ping_result": "ping -c 3 8.8.8.8&>/dev/null; echo $?"  # echoes exit code
+}
+ADB_APN_COMMANDS = {  # iterated with .values(); relies on insertion order
+    "home": "input keyevent 3",
+    "setting": "am start -a android.settings.AIRPLANE_MODE_SETTINGS",
+    "toggle": "input tap " + \
+              CONF.adb.apn_mode_toggle_location.x + " " + \
+              CONF.adb.apn_mode_toggle_location.y
+}
+
+
+class State(enum.Enum):  # states reported for the control and user plane
+    error = "-1"  # returned when a check itself could not be completed
+    disconnected = "0"
+    connecting = "1"
+    connected = "2"
+
+    @classmethod
+    def has_value(cls, value):  # True if value equals some member's value
+        return value in cls._value2member_map_
+
+
+edge_status = {  # payload posted to CONF.report_url; run() refreshes 'status'
+    'name': CONF.edge_name,
+    'status': {
+        'control_plane': None,
+        'user_plane': 'connected'
+    }
+}
+
+
+def _run_adb_shell(adb, command):
+    '''run one adb shell command; returns (ok, first output item or error)'''
+    output = adb.shell_command(command)
+    if adb.lastFailed():
+        return False, "[ERROR]: " + command + " failed"
+    time.sleep(2)  # fixed 2 s pause after every successful command
+    first = None if output is None else output[0]
+    return True, first
+
+
+def get_control_plane_state():
+    '''
+    check aether control plane works by toggling airplane mode;
+    returns a (State, error message or None) tuple
+    '''
+    adb = ADB()
+    if adb.set_adb_path(CONF.adb.path) is False:
+        return State.error, "[ERROR]: " + CONF.adb.path + " not found"
+
+    # get the current airplane mode
+    success, result = _run_adb_shell(adb, ADB_GET_COMMANDS['apn_mode'])
+    if not success or result is None:
+        return State.error, result
+    apn_mode_on = result == "1"
+
+    # toggle airplane mode; when it started off, tap twice to end with it off
+    commands = list(ADB_APN_COMMANDS.values())
+    if not apn_mode_on:
+        commands.append(ADB_APN_COMMANDS['toggle'])
+    for command in commands:
+        success, result = _run_adb_shell(adb, command)
+        if not success:
+            return State.error, result
+
+    # additional wait for UE to fully attach
+    time.sleep(3)
+
+    # get connection state, polling while the device reports "connecting"
+    state = State.connecting.value
+    while state == State.connecting.value:
+        success, result = _run_adb_shell(adb, ADB_GET_COMMANDS['lte_state'])
+        if not success or result is None:
+            return State.error, result
+        # text after the first "="; output without "=" gives "" -> error
+        state = result.partition("=")[2].strip()
+
+    if not State.has_value(state):
+        return State.error, None
+    return State(state), None
+
+
+def get_user_plane_state():
+    '''
+    ping 8.8.8.8 from the device to verify user plane connectivity;
+    returns a (State, error message or None) tuple
+    '''
+    adb = ADB()
+    if adb.set_adb_path(CONF.adb.path) is False:
+        return State.error, "[ERROR]: " + CONF.adb.path + " not found"
+
+    ok, exit_code = _run_adb_shell(adb, ADB_GET_COMMANDS['ping_result'])
+    if not (ok and exit_code is not None):
+        return State.error, exit_code
+    if exit_code == "0":
+        return State.connected, None
+    return State.disconnected, None
+
+
+def report_aether_network_state():
+    '''
+    report the aether network state to the monitoring server
+
+    raises requests.HTTPError if the server answers with a 4xx/5xx status
+    '''
+    response = requests.post(CONF.report_url, json=edge_status)
+    # the server replies 201, so accept any non-error status, not just 200
+    response.raise_for_status()
+    print("[INFO]: reported the status")
+
+
+def run():
+    while True:  # one check-and-report cycle every 10 minutes
+        cp, _ = get_control_plane_state()
+        up, _ = get_user_plane_state()
+        edge_status['status'].update(control_plane=cp.name, user_plane=up.name)
+        try:  # a failed report must not kill the daemon; retry next cycle
+            report_aether_network_state()
+        except requests.exceptions.RequestException as e:
+            print("[ERROR]: failed to report the status: " + str(e))
+        time.sleep(600)
+
+
+def main():  # run the reporting loop inside a python-daemon DaemonContext
+    with daemon.DaemonContext():
+        run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/edge-monitoring/edge_monitoring_server.py b/edge-monitoring/edge_monitoring_server.py
new file mode 100755
index 0000000..ed2088c
--- /dev/null
+++ b/edge-monitoring/edge_monitoring_server.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from flask import Flask, jsonify, abort, request
+
+app = Flask(__name__)
+edges = [  # module-level, in-memory list of edge records served by the API
+    {
+        'name': 'production-edge-example',  # sample record present at start
+        'status': {
+            'control_plane': 'connected',
+            'user_plane': 'connected'
+        },
+        'last_update': time.time()  # epoch seconds; here the import time
+    }
+]
+
+
+@app.route('/edges/healthz', methods=['GET'])
+def get_health():  # fixed "healthy" reply; target of the k8s liveness probe
+    return {'message': 'healthy'}
+
+
+@app.route('/edges', methods=['GET'])
+def get_edges():  # returns every stored edge record as {"edges": [...]}
+    return jsonify({'edges': edges})
+
+
+@app.route('/edges/<string:name>', methods=['GET'])
+def get_edge(name):
+    '''Return the stored edge called name as JSON, or 404 if unknown.'''
+    found = next((e for e in edges if e['name'] == name), None)
+    if found is None:
+        abort(404)
+    return jsonify({'edge': found})
+
+
+@app.route('/edges', methods=['POST'])
+def create_or_update_edge():
+    '''Store the status posted by an edge, adding the edge if it is new.'''
+    # malformed bodies get 400 instead of failing later with a 500
+    body = request.json
+    if not isinstance(body, dict) or 'name' not in body:
+        abort(400)
+    status = body.get('status')
+    required = {'control_plane', 'user_plane'}
+    if not isinstance(status, dict) or not status.keys() >= required:
+        abort(400)
+
+    req_edge = {
+        'name': body['name'],
+        'status': {key: status[key] for key in sorted(required)},
+        'last_update': time.time()
+    }
+
+    known = next((e for e in edges if e['name'] == req_edge['name']), None)
+    if known is None:
+        print("new edge request " + req_edge['name'])
+        edges.append(req_edge)
+    else:
+        known.update(status=req_edge['status'],
+                     last_update=req_edge['last_update'])
+
+    # 201 is returned for updates as well as for newly added edges
+    return jsonify({'edge': req_edge}), 201
+
+
+if __name__ == '__main__':
+    app.run(debug=False, host='0.0.0.0', port=80)  # debugger would allow RCE
diff --git a/edge-monitoring/edge_monitoring_server_k8s.yaml b/edge-monitoring/edge_monitoring_server_k8s.yaml
new file mode 100644
index 0000000..49a21e2
--- /dev/null
+++ b/edge-monitoring/edge_monitoring_server_k8s.yaml
@@ -0,0 +1,67 @@
+apiVersion: v1
+kind: Namespace  # namespace referenced by the other resources in this file
+metadata:
+  name: edge-monitoring
+spec:
+  finalizers:
+  - kubernetes
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: edge-monitoring-server
+  labels:
+    app: edge-monitoring-server
+  namespace: edge-monitoring
+spec:
+  selector:
+    matchLabels:
+      app: edge-monitoring-server
+  replicas: 1  # NOTE(review): edge records live in process memory, so extra replicas would not share them
+  strategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app: edge-monitoring-server
+    spec:
+      containers:
+      - name: server
+        image: docker.io/omecproject/edge-monitoring-server:0.1.0
+        imagePullPolicy: Always
+        command: ["python", "edge_monitoring_server.py"]
+        livenessProbe:
+          httpGet:
+            path: /edges/healthz  # served by get_health() in the server
+            port: 80  # port passed to app.run() in edge_monitoring_server.py
+          initialDelaySeconds: 3
+          periodSeconds: 3
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: edge-monitoring-server
+  namespace: edge-monitoring
+spec:
+  selector:
+    app: edge-monitoring-server  # matches the pod label set by the Deployment
+  ports:
+  - port: 80
+    targetPort: 80  # port passed to app.run() in edge_monitoring_server.py
+    protocol: TCP
+    name: server
+---
+apiVersion: extensions/v1beta1  # NOTE(review): Ingress was removed from this API group in Kubernetes 1.22; confirm cluster version
+kind: Ingress
+metadata:
+  name: edge-monitoring-server
+  namespace: edge-monitoring
+spec:
+  rules:
+  - host: aether.onlab.us
+    http:
+      paths:
+      - backend:
+          serviceName: edge-monitoring-server
+          servicePort: 80
+        path: /edges  # prefix shared by all routes of the monitoring server
\ No newline at end of file
diff --git a/edge-monitoring/requirements.txt b/edge-monitoring/requirements.txt
new file mode 100644
index 0000000..0410a74
--- /dev/null
+++ b/edge-monitoring/requirements.txt
@@ -0,0 +1,4 @@
+flask
+requests
+git+https://github.com/sch3m4/pyadb@master#egg=pyadb
+python-daemon
diff --git a/slack-notifier/aether_status_notifier.py b/slack-notifier/aether_status_notifier.py
new file mode 100644
index 0000000..4cd3aa2
--- /dev/null
+++ b/slack-notifier/aether_status_notifier.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python
+
+# Copyright 2020-present Open Networking Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import json
+import enum
+import time
+import datetime
+import requests
+from pytz import timezone
+from datetime import date
+from collections import namedtuple
+from rancher import Client as RancherClient
+from slack import WebClient as SlackClient
+from slack.errors import SlackApiError
+
+
+class Status(enum.Enum):  # outcome of one check; keys of the EMOJI table
+    healthy = 1
+    unhealthy = 2
+    error = 3  # check failed
+    unavailable = 4  # check not automated
+
+
+# Load settings; nested JSON objects become namedtuples (attribute access).
+with open(os.getenv('CONFIG_FILE', "./config.json")) as _conf_file:
+    CONF = json.load(_conf_file, object_hook=lambda d:
+                     namedtuple('X', d.keys())(*d.values()))
+
+EMOJI = {  # emoji text from the config, placed before each report entry
+    Status.healthy: CONF.slack.emoji.healthy,
+    Status.unhealthy: CONF.slack.emoji.unhealthy,
+    Status.error: CONF.slack.emoji.error,
+    Status.unavailable: CONF.slack.emoji.unavailable
+}
+
+HEALTHY_STATES = ["active", "connected"]  # states _get_status() treats as healthy
+
+REPORT_HEADER = '''\
+*******************************************************************************
+'''
+REPORT_HEADER += CONF.slack.report_header
+REPORT_HEADER += date.today().strftime(" %d/%m/%Y\n")  # evaluated once, at import
+REPORT_HEADER += EMOJI[Status.healthy] + " Healthy   "  # legend line starts here
+REPORT_HEADER += EMOJI[Status.unhealthy] + " Unhealthy   "
+REPORT_HEADER += EMOJI[Status.error] + " Checks failed   "
+REPORT_HEADER += EMOJI[Status.unavailable] + " Checks not automated\n"
+REPORT_HEADER += '''\
+*******************************************************************************
+'''
+
+MSG_PREFIX_BLANK = "      :warning-sign: "  # used by _get_report() for non-error entries
+MSG_PREFIX_ERROR = "       [ERROR]: "  # used by _get_report() when status is Status.error
+
+
+def _get_status(state_string):
+    '''Map a state string to a Status using the HEALTHY_STATES list.'''
+    if state_string not in HEALTHY_STATES:
+        return Status.unhealthy
+    return Status.healthy
+
+
+def _get_report(name, status, msg=()):
+    '''Return report lines for one check: a header, then one per message.'''
+    if not status:  # a falsy status (e.g. "") produces no lines at all
+        return []
+    if isinstance(msg, str):  # a bare string would be split into characters
+        msg = [msg]
+    is_error = status == Status.error
+    msg_prefix = MSG_PREFIX_ERROR if is_error else MSG_PREFIX_BLANK
+    return [EMOJI[status] + " " + name] + [msg_prefix + m for m in msg]
+
+
+def _run_adb_command(adb_path, ssh_client, command):
+    '''Run "adb shell <command>" via ssh_client; return (ok, output/error).'''
+    final_command = " ".join([adb_path, "shell", command])
+    _, out_stream, err_stream = ssh_client.exec_command(final_command)
+    # any "Error" text on stderr marks the command as failed
+    if "Error" in err_stream.read().decode("ascii").strip("\n"):
+        return False, "failed to run command: " + final_command
+    text = out_stream.read().decode("ascii").strip("\n")
+    time.sleep(2)
+    return True, text
+
+
+def get_project_status(cluster, project):
+    '''
+    Check every app (check_type "app") or workload of a managed project.
+
+    Returns (status, msg): Status.unhealthy with one warning per unhealthy
+    target, Status.healthy if all are healthy, "" if there were no targets.
+    '''
+    check_type = project.check_type
+    status = ""
+    msg = []
+    for p in cluster.projects(name=project.name):
+        targets = p.apps() if check_type == "app" else p.workloads()
+        for target in targets.data:
+            status = Status.healthy  # at least one target was checked
+            if _get_status(target.state) == Status.unhealthy:
+                msg += [check_type + " " + target.name + " is unhealthy"]
+    # any unhealthy target decides the result, not just the last one seen
+    return (Status.unhealthy if msg else status), msg
+
+
+def get_aether_network_status(edge_name):
+    '''
+    Fetch the latest status an edge reported to the edge monitoring server.
+
+    Returns (status, msg) where msg is always a list of strings:
+    unavailable if the server does not know the edge (404), error if the
+    request fails or the last report is older than 10 minutes, otherwise
+    healthy/unhealthy according to the control and user plane states.
+    '''
+    msg = []
+
+    try:
+        req_url = CONF.edge_monitoring.api_url + "/" + edge_name
+        response = requests.get(req_url)
+
+        if response.status_code == 404:
+            return Status.unavailable, []
+        response.raise_for_status()
+        json_resp = json.loads(response.text)['edge']
+    except Exception as e:
+        # wrap in a list: _get_report() emits one report line per item
+        return Status.error, [str(e)]
+
+    last_update = datetime.datetime.fromtimestamp(json_resp['last_update'])
+    time_diff = datetime.datetime.now() - last_update
+    time_diff_mins = int(round(time_diff.total_seconds() / 60))
+    if time_diff_mins > 10:
+        msg += ['status report not received for ' + str(time_diff_mins) + 'min']
+        return Status.error, msg
+
+    for plane, label in (('control_plane', "control plane"),
+                         ('user_plane', "user plane")):
+        if _get_status(json_resp['status'][plane]) is not Status.healthy:
+            msg += [label + " is not healthy"]
+
+    return (Status.unhealthy if msg else Status.healthy), msg
+
+
+def get_k8s_status(cluster):
+    '''
+    Summarise the health of one cluster object obtained from Rancher.
+    Returns (Status.error, [transitioningMessage]) when the cluster state
+    is not healthy; otherwise (Status.healthy, warnings) with one warning
+    per failing component condition and per unhealthy node.
+    '''
+    status = _get_status(cluster.state)
+    if status is Status.unhealthy or cluster.state == "unavailable":
+        return Status.error, [cluster.transitioningMessage]
+
+    component_warnings = [
+        component.name + " is unhealthy"
+        for component in cluster.componentStatuses
+        for condition in component.conditions if condition.status != "True"]
+    node_warnings = [node.hostname + " is unhealthy" for node in cluster.nodes()
+                     if _get_status(node.state) != Status.healthy]
+    return status, component_warnings + node_warnings
+
+
+def get_cluster_health_report(cluster, edge=False):
+    '''
+    Build the report text for one cluster: a title line, the Kubernetes
+    check, one entry per managed project and, when edge is true, the
+    Aether network check. The lines are joined with newlines.
+    '''
+    lines = ["*[" + cluster.name + "]*"]
+
+    # Kubernetes itself comes first; its result gates the project checks
+    k8s_status, k8s_msg = get_k8s_status(cluster)
+    lines.extend(_get_report("Kubernetes", k8s_status, k8s_msg))
+
+    # Projects are inspected only on a healthy cluster, else listed as failed
+    for project in CONF.managed_projects:
+        if k8s_status != Status.healthy:
+            p_status, p_msg = Status.error, []
+        else:
+            p_status, p_msg = get_project_status(cluster, project)
+        lines.extend(_get_report(project.display, p_status, p_msg))
+
+    # Check Aether network health for Edges
+    # TODO: separate report for control plane and user plane
+    if edge:
+        net_status, net_msg = get_aether_network_status(cluster.name)
+        lines.extend(_get_report("Aether Network", net_status, net_msg))
+
+    return "\n".join(lines)
+
+
+def main():
+    '''Collect the health of all production clusters and post it to Slack.'''
+    report = REPORT_HEADER
+
+    # Get cluster status from Rancher
+    try:
+        rancher_client = RancherClient(
+            url=CONF.rancher.api_url, access_key=CONF.rancher.access_key,
+            secret_key=CONF.rancher.secret_key)
+        clusters = rancher_client.list_cluster().data
+    except Exception as e:
+        # _get_report() returns a list of lines; join before appending to str
+        report += "\n".join(_get_report("Rancher", Status.error, [str(e)]))
+        report += "\n"
+        clusters = []
+
+    # Check cluster health and make a report
+    for cluster in clusters:
+        if "production" in cluster.name:
+            edge = "edge" in cluster.name
+            report += get_cluster_health_report(cluster, edge) + "\n\n"
+
+    # Publish the report to Slack channel
+    try:
+        slack_client = SlackClient(token=CONF.slack.api_token)
+        slack_client.chat_postMessage(
+            channel=CONF.slack.channel,
+            text=report)
+    except SlackApiError as e:
+        print(f"Got an error: {e.response['error']}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/slack-notifier/config.json b/slack-notifier/config.json
new file mode 100644
index 0000000..c19c338
--- /dev/null
+++ b/slack-notifier/config.json
@@ -0,0 +1,38 @@
+{
+    "rancher": {
+        "api_url": "https://aether.onlab.us/v3",
+        "access_key": "<RANCHER_ACCESS_KEY>",
+        "secret_key": "<RANCHER_SECRET_KEY>"
+    },
+    "edge_monitoring": {
+        "api_url": "https://aether.onlab.us/edges"
+    },
+    "slack": {
+        "api_token": "<SLACK_BOT_TOKEN>",
+        "channel": "#aether-status",
+        "emoji": {
+            "healthy": ":green_circle:",
+            "unhealthy": ":red_circle:",
+            "error": ":yellow_circle:",
+            "unavailable": ":white_circle:"
+        },
+        "report_header": "Aether Daily Status Report"
+    },
+    "managed_projects": [
+        {
+            "name": "System",
+            "check_type": "pod",
+            "display": "Kubernetes System Pods"
+        },
+        {
+            "name": "ConnectivityService",
+            "check_type": "app",
+            "display": "Connectivity Service Apps"
+        },
+        {
+            "name": "CordPlatform",
+            "check_type": "app",
+            "display": "Monitoring Service Apps"
+        }
+    ]
+}
diff --git a/slack-notifier/requirements.txt b/slack-notifier/requirements.txt
new file mode 100644
index 0000000..b00a8bf
--- /dev/null
+++ b/slack-notifier/requirements.txt
@@ -0,0 +1,4 @@
+slackclient
+requests
+git+https://github.com/rancher/client-python@master#egg=client-python
+pytz