#!/usr/bin/env python

# Copyright 2020-present Open Networking Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| 16 | |
Andy Bavier | 614af14 | 2020-08-07 14:49:56 -0700 | [diff] [blame] | 17 | import os |
Hyunsun Moon | f32ae9a | 2020-05-28 13:17:45 -0700 | [diff] [blame] | 18 | import time |
Andy Bavier | 614af14 | 2020-08-07 14:49:56 -0700 | [diff] [blame] | 19 | import datetime |
| 20 | import pytz |
| 21 | import threading |
| 22 | from icalevents.icalevents import events |
Andy Bavier | 4021a2f | 2020-07-29 12:39:47 -0700 | [diff] [blame] | 23 | from flask import Flask, jsonify, abort, request, Response |
| 24 | import prometheus_client as prom |
| 25 | |
# Location of the maintenance calendar (iCal feed); None when not configured.
SECRET_ICAL_URL = os.getenv("SECRET_ICAL_URL")

# Name of the Aether environment being monitored (e.g., "production").
# Downtime is scheduled by appending the environment to the cluster name,
# for example "ace-tucson-production".
AETHER_ENV = os.getenv("AETHER_ENV", "production")

# An edge whose agent has been silent for longer than this many seconds is
# moved to the "no result" status (12 minutes).
NO_RESULT_THRESHOLD = 12 * 60
Hyunsun Moon | f32ae9a | 2020-05-28 13:17:45 -0700 | [diff] [blame] | 35 | |
app = Flask(__name__)

# In-memory registry of known edges, seeded with a single placeholder entry.
# Each entry holds the edge name, its latest control/user plane status and
# the time (seconds since the epoch) of the last update.
edges = [
    dict(
        name='ace-example',
        status=dict(control_plane='connected', user_plane='connected'),
        last_update=time.time(),
    ),
]
| 47 | |
# Numeric code exported for each status string, running from -2 to 2.
status_codes = {
    label: code
    for code, label in enumerate(
        ("no result", "error", "disconnected", "connecting", "connected"),
        start=-2,
    )
}

# Alternative text that also identifies an edge in a calendar event,
# keyed by the environment-qualified edge name.
room_mapping = dict([
    ("ace-menlo-pixel-production", "(Compute)-MP-1-Aether Production"),
    ("ace-menlo-staging", "(Compute)-MP-1-Aether Staging"),
])
| 60 | |
# Every gauge below carries a single "name" label holding the edge name.

# Legacy test status metrics: one status code between -2 and 2 per plane.
cp_status = prom.Gauge(
    "aetheredge_status_control_plane",
    "Control plane status code",
    labelnames=["name"],
)
up_status = prom.Gauge(
    "aetheredge_status_user_plane",
    "User plane status code",
    labelnames=["name"],
)

# Simplified test result metrics, each reported as 0 or 1.
e2e_tests_ok = prom.Gauge(
    "aetheredge_e2e_tests_ok",
    "Last connect and ping test both passed",
    labelnames=["name"],
)
connect_test_ok = prom.Gauge(
    "aetheredge_connect_test_ok",
    "Last connect test passed",
    labelnames=["name"],
)
ping_test_ok = prom.Gauge(
    "aetheredge_ping_test_ok",
    "Last ping test passed",
    labelnames=["name"],
)
e2e_tests_down = prom.Gauge(
    "aetheredge_e2e_tests_down",
    "E2E tests not reporting",
    labelnames=["name"],
)

# Bookkeeping metrics: time of the last report and maintenance window state.
last_update = prom.Gauge(
    "aetheredge_last_update",
    "Last reported test result",
    labelnames=["name"],
)
maint_window = prom.Gauge(
    "aetheredge_in_maintenance_window",
    "Currently in a maintenance window",
    labelnames=["name"],
)
| 74 | |
def is_my_event(event, name):
    """Return True when a calendar event refers to the edge called ``name``.

    The event's ``summary``, ``location`` and ``description`` attributes are
    searched for the edge's full name. For names starting with "ace-" the full
    name is the edge name with ``-<AETHER_ENV>`` appended; any other name is
    used as is. If the full name has an entry in ``room_mapping``, the mapped
    text is accepted as a match too.

    Attributes that are missing or set to None are treated as empty text, so
    an event with an unset field can no longer raise TypeError here.
    """
    # The full name does not depend on the field, so build it only once.
    fullname = name
    if name.startswith("ace-"):
        fullname = "%s-%s" % (name, AETHER_ENV)

    # `or ""` covers attributes that exist but hold None.
    texts = [
        getattr(event, field, None) or ""
        for field in ("summary", "location", "description")
    ]

    if any(fullname in text for text in texts):
        return True
    if fullname in room_mapping:
        alias = room_mapping[fullname]
        return any(alias in text for text in texts)
    return False
| 85 | |
def is_naive_datetime(d):
    """Return True when ``d`` has no tzinfo or its tzinfo yields no UTC offset."""
    tz = d.tzinfo
    if tz is None:
        return True
    return tz.utcoffset(d) is None
| 88 | |
def process_all_day_events(es):
    """Localize naive start/end times of all-day events to US/Pacific, in place.

    Only events whose ``all_day`` attribute is truthy are touched, and only
    those of their ``start``/``end`` values that are naive datetimes.
    """
    for event in es:
        if not event.all_day:
            continue
        # Naive datetimes cannot be compared with the timezone-aware "now"
        # used elsewhere, so attach a timezone to them.
        pacific = pytz.timezone('US/Pacific')
        for attr in ("start", "end"):
            moment = getattr(event, attr)
            if is_naive_datetime(moment):
                setattr(event, attr, pacific.localize(moment))
| 98 | |
def in_maintenance_window(events, name, now):
    """Return True if an event that is under way at ``now`` matches ``name``.

    An event is under way when it started strictly before ``now`` and ends
    strictly after it; only such events are passed to ``is_my_event``.
    """
    return any(
        event.start < now and event.end > now and is_my_event(event, name)
        for event in events
    )
| 105 | |
def pull_maintenance_events():
    """Poll the maintenance calendar forever and refresh every edge's state.

    Once a minute the events starting from the current time are fetched from
    ``SECRET_ICAL_URL``. For each entry in ``edges`` the ``maintenance`` dict
    is then updated with ``in_window`` (whether a matching event is under
    way) and ``last_update`` (seconds since the epoch).

    Never returns. Any exception raised while fetching or evaluating events
    is printed, and that round leaves the edges untouched. Evaluation errors
    are caught too, so a single bad calendar entry cannot end the polling
    loop.
    """
    while True:
        now = datetime.datetime.now(pytz.utc)
        try:
            es = events(SECRET_ICAL_URL, start=now)
            process_all_day_events(es)
            # Work out every edge's state before storing anything, so a
            # failure part-way through does not leave a partial update.
            windows = [
                (edge, in_maintenance_window(es, edge['name'], now))
                for edge in edges
            ]
        except Exception as e:
            print(e)
        else:
            for edge, in_window in windows:
                maintenance = edge.setdefault('maintenance', {})
                maintenance['in_window'] = in_window
                maintenance['last_update'] = time.time()
        time.sleep(60)
Andy Bavier | 4021a2f | 2020-07-29 12:39:47 -0700 | [diff] [blame] | 121 | |
def time_out_stale_results():
    """Mark both planes of every edge silent for too long as "no result".

    An edge is stale when more than ``NO_RESULT_THRESHOLD`` seconds have
    passed since its ``last_update`` timestamp.
    """
    for edge in edges:
        if time.time() - edge["last_update"] <= NO_RESULT_THRESHOLD:
            continue
        for plane in ('control_plane', 'user_plane'):
            edge['status'][plane] = "no result"
| 128 | |
def remove_edge_from_metrics(name):
    """Drop the series labelled ``name`` from every exported gauge.

    Removal is best effort. Each gauge is handled on its own, so a gauge that
    holds no series for this edge does not stop the remaining gauges from
    being cleaned up.
    """
    gauges = (
        cp_status,
        up_status,
        last_update,
        e2e_tests_ok,
        connect_test_ok,
        ping_test_ok,
        e2e_tests_down,
        maint_window,
    )
    for gauge in gauges:
        try:
            gauge.remove(name)
        except KeyError:
            # Nothing was recorded for this edge on this gauge (for example
            # maint_window before the first calendar poll).
            pass
Andy Bavier | 4021a2f | 2020-07-29 12:39:47 -0700 | [diff] [blame] | 145 | |
@app.route('/edges/metrics', methods=['GET'])
def get_prometheus_metrics():
    """Serve the current state of all edges in Prometheus text format.

    Stale results are timed out first. Every edge except the "ace-example"
    placeholder then has its gauges refreshed:

    * legacy status codes for the control and user plane, looked up in
      ``status_codes`` (an unknown status string raises KeyError);
    * the time of the last update, and the maintenance window flag when
      maintenance information is present;
    * the binary results: ``e2e_tests_down`` is 1 when either plane is in
      "error" or "no result"; otherwise each ``*_ok`` gauge is 1 when the
      corresponding plane(s) are "connected".
    """
    time_out_stale_results()
    not_reporting = ("error", "no result")

    for edge in edges:
        name = edge['name']
        if name == "ace-example":
            continue

        connect_status = edge['status']['control_plane']
        ping_status = edge['status']['user_plane']

        cp_status.labels(name).set(status_codes[connect_status])
        up_status.labels(name).set(status_codes[ping_status])
        last_update.labels(name).set(edge['last_update'])
        if 'maintenance' in edge:
            maint_window.labels(name).set(int(edge['maintenance']['in_window']))

        # The "ok" gauges are only meaningful while the tests are reporting.
        tests_down = connect_status in not_reporting or ping_status in not_reporting
        connect_ok = not tests_down and connect_status == "connected"
        ping_ok = not tests_down and ping_status == "connected"

        connect_test_ok.labels(name).set(int(connect_ok))
        ping_test_ok.labels(name).set(int(ping_ok))
        e2e_tests_ok.labels(name).set(int(connect_ok and ping_ok))
        e2e_tests_down.labels(name).set(int(tests_down))

    exported = (
        cp_status,
        up_status,
        last_update,
        maint_window,
        connect_test_ok,
        ping_test_ok,
        e2e_tests_ok,
        e2e_tests_down,
    )
    res = [prom.generate_latest(gauge) for gauge in exported]

    return Response(res, mimetype="text/plain")
| 188 | |
Hyunsun Moon | f32ae9a | 2020-05-28 13:17:45 -0700 | [diff] [blame] | 189 | |
@app.route('/edges/healthz', methods=['GET'])
def get_health():
    """Health check endpoint; always answers with a fixed "healthy" message."""
    body = {'message': 'healthy'}
    return body
| 193 | |
| 194 | |
@app.route('/edges', methods=['GET'])
def get_edges():
    """Return all known edges as JSON, after timing out stale results."""
    time_out_stale_results()
    payload = {'edges': edges}
    return jsonify(payload)
| 199 | |
| 200 | |
@app.route('/edges/<string:name>', methods=['GET'])
def get_edge(name):
    """Return the first edge called ``name`` as JSON, or respond with 404."""
    time_out_stale_results()
    for candidate in edges:
        if candidate['name'] == name:
            return jsonify({'edge': candidate})
    abort(404)
| 208 | |
| 209 | |
@app.route('/edges', methods=['POST'])
@app.route('/testresults', methods=['POST'])
def create_or_update_edge():
    """Record a test result for an edge, creating the edge if it is new.

    The JSON body must be an object with a ``name`` and a ``status`` object
    containing ``control_plane`` and ``user_plane``. A body that is missing,
    is not an object, or lacks any of these keys is rejected with 400 rather
    than failing with an unhandled exception.

    For a known edge only the two plane statuses and ``last_update`` are
    overwritten; other keys stored on the edge are kept. Responds with the
    recorded values and status code 201 in both cases.
    """
    payload = request.json
    if not payload or not isinstance(payload, dict):
        abort(400)
    if 'name' not in payload or 'status' not in payload:
        abort(400)

    status = payload['status']
    if not isinstance(status, dict):
        abort(400)
    if 'control_plane' not in status or 'user_plane' not in status:
        abort(400)

    req_edge = {
        'name': payload['name'],
        'status': {
            'control_plane': status['control_plane'],
            'user_plane': status['user_plane']
        },
        'last_update': time.time()
    }

    edge = [edge for edge in edges if edge['name'] == req_edge['name']]
    if not edge:
        print("new edge request " + req_edge['name'])
        edges.append(req_edge)
    else:
        # Update in place so that extra keys on the stored edge survive.
        edge[0]['status']['control_plane'] = req_edge['status']['control_plane']
        edge[0]['status']['user_plane'] = req_edge['status']['user_plane']
        edge[0]['last_update'] = req_edge['last_update']

    return jsonify({'edge': req_edge}), 201
| 239 | |
| 240 | |
@app.route('/edges/<string:name>', methods=['DELETE'])
@app.route('/testresults/<string:name>', methods=['DELETE'])
def delete_edge(name):
    """Remove the first edge called ``name`` along with its metrics.

    Responds with 404 when no edge has that name.
    """
    print("delete edge request " + name)
    for position, candidate in enumerate(edges):
        if candidate['name'] != name:
            continue
        del edges[position]
        remove_edge_from_metrics(name)
        break
    else:
        # The loop ended without finding a matching edge.
        abort(404)
    return jsonify({'result': True})
| 255 | |
| 256 | |
if __name__ == '__main__':
    # The calendar is only polled when both the calendar URL and the
    # environment name are configured.
    if SECRET_ICAL_URL and AETHER_ENV:
        print(" * Starting maintenance calendar polling thread (Aether env: %s)" % AETHER_ENV)
        t = threading.Thread(target=pull_maintenance_events)
        # The polling loop never returns; as a daemon thread it no longer
        # keeps the process alive once the web server has stopped.
        t.daemon = True
        t.start()
    # Debug mode is not forced on any more: this server listens on all
    # interfaces, and Flask's debug mode exposes an interactive debugger that
    # can execute arbitrary code. Set FLASK_DEBUG=1 to enable it for local
    # development.
    app.run(host='0.0.0.0', port=80)