Hyunsun Moon | f32ae9a | 2020-05-28 13:17:45 -0700 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | |
| 3 | # Copyright 2020-present Open Networking Foundation |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | |
Andy Bavier | 614af14 | 2020-08-07 14:49:56 -0700 | [diff] [blame] | 17 | import os |
Hyunsun Moon | f32ae9a | 2020-05-28 13:17:45 -0700 | [diff] [blame] | 18 | import time |
Andy Bavier | 614af14 | 2020-08-07 14:49:56 -0700 | [diff] [blame] | 19 | import datetime |
| 20 | import pytz |
| 21 | import threading |
| 22 | from icalevents.icalevents import events |
Andy Bavier | 4021a2f | 2020-07-29 12:39:47 -0700 | [diff] [blame] | 23 | from flask import Flask, jsonify, abort, request, Response |
| 24 | import prometheus_client as prom |
| 25 | |
# iCal feed URL of the maintenance calendar (read from the environment;
# None when the variable is not set).
SECRET_ICAL_URL = os.getenv("SECRET_ICAL_URL")

# Aether environment that the server is monitoring (e.g., "production").
# To schedule downtime, postfix the cluster name with the env:
# "ace-tucson-production".
AETHER_ENV = os.getenv("AETHER_ENV", "production")

# Move an edge to "no result" status if its agent has not reported for
# this many seconds (12 minutes).
NO_RESULT_THRESHOLD = 12 * 60
Hyunsun Moon | f32ae9a | 2020-05-28 13:17:45 -0700 | [diff] [blame] | 35 | |
app = Flask(__name__)

# In-memory list of monitored edges. It is seeded with a placeholder
# entry named "ace-example" showing the shape of an edge record.
edges = [
    dict(
        name='ace-example',
        status=dict(
            control_plane='connected',
            user_plane='connected',
        ),
        last_update=time.time(),
    ),
]
| 47 | |
# Numeric code reported for each textual test status.
status_codes = {
    'no result': -2,
    'error': -1,
    'disconnected': 0,
    'connecting': 1,
    'connected': 2,
}

# Alternate string to look for in calendar events, keyed by the
# environment-qualified edge name (presumably the calendar room/resource
# booked for that edge -- confirm against the calendar).
room_mapping = {
    'ace-menlo-pixel-production': '(Compute)-MP-1-Aether Production',
    'ace-menlo-staging': '(Compute)-MP-1-Aether Staging',
}
| 60 | |
# Legacy test status metrics, reporting a status code between -2 and 2
cp_status = prom.Gauge(
    "aetheredge_status_control_plane",
    "Control plane status code",
    labelnames=["name"],
)
up_status = prom.Gauge(
    "aetheredge_status_user_plane",
    "User plane status code",
    labelnames=["name"],
)

# Simplified binary test result metrics (each is 0 or 1 per edge)
connect_test_ok = prom.Gauge(
    "aetheredge_connect_test_ok",
    "Last connect test passed",
    labelnames=["name"],
)
connect_test_down = prom.Gauge(
    "aetheredge_connect_test_down",
    "Connect test not reporting",
    labelnames=["name"],
)
ping_test_ok = prom.Gauge(
    "aetheredge_ping_test_ok",
    "Last ping test passed",
    labelnames=["name"],
)
ping_test_down = prom.Gauge(
    "aetheredge_ping_test_down",
    "Ping test not reporting",
    labelnames=["name"],
)

# Other metrics
last_update = prom.Gauge(
    "aetheredge_last_update",
    "Last reported test result",
    labelnames=["name"],
)
maint_window = prom.Gauge(
    "aetheredge_in_maintenance_window",
    "Currently in a maintenance window",
    labelnames=["name"],
)
| 74 | |
def is_my_event(event, name):
    """Return True if a calendar event mentions the edge called ``name``.

    Names starting with "ace-" are first qualified with the monitored
    environment ("<name>-<AETHER_ENV>").  The event matches when its
    summary, location or description contains either that qualified name
    or the string ``room_mapping`` associates with it.

    Args:
        event: object that may expose ``summary``, ``location`` and
            ``description`` attributes.
        name: edge name as stored in ``edges``.

    Returns:
        bool: True on a match, False otherwise.
    """
    # The qualified name does not depend on the field, so compute it once.
    fullname = "%s-%s" % (name, AETHER_ENV) if name.startswith("ace-") else name

    # A field can be missing or present-but-None; the getattr default only
    # covers the former, and "x in None" raises TypeError, so normalise
    # both cases to an empty string.
    texts = [
        getattr(event, field, None) or ""
        for field in ("summary", "location", "description")
    ]

    if any(fullname in text for text in texts):
        return True

    # Fall back to the alternate string registered for this edge, if any.
    room = room_mapping.get(fullname)
    return room is not None and any(room in text for text in texts)
| 85 | |
def is_naive_datetime(d):
    """Return True if ``d`` carries no timezone or its timezone yields no UTC offset."""
    tz = d.tzinfo
    if tz is None:
        return True
    return tz.utcoffset(d) is None
| 88 | |
def process_all_day_events(es):
    """Attach the US/Pacific timezone to naive start/end times of all-day events.

    Events are modified in place; events that are not all-day, and
    datetimes that are already timezone-aware, are left untouched.

    Args:
        es: iterable of events exposing ``all_day``, ``start`` and ``end``.
    """
    for ev in es:
        if not ev.all_day:
            continue
        # All day events have naive datetimes, which breaks comparisons
        # against a timezone-aware "now".
        tz = pytz.timezone('US/Pacific')
        for attr in ("start", "end"):
            when = getattr(ev, attr)
            if is_naive_datetime(when):
                setattr(ev, attr, tz.localize(when))
| 98 | |
def in_maintenance_window(events, name, now):
    """Return True if any event for edge ``name`` is in progress at ``now``.

    An event is in progress when ``now`` lies strictly between its start
    and end; only such events are checked against the edge name.

    Args:
        events: iterable of events exposing ``start`` and ``end``.
        name: edge name to look for.
        now: the instant to test, comparable with the event times.
    """
    return any(
        ev.start < now and ev.end > now and is_my_event(ev, name)
        for ev in events
    )
| 105 | |
def pull_maintenance_events():
    """Poll the maintenance calendar forever, refreshing each edge's window flag.

    Every pass fetches events starting from the current UTC time.  On
    success each edge in ``edges`` gets ``maintenance.in_window`` and
    ``maintenance.last_update`` refreshed; on failure the exception is
    printed and the edges are left as they were.  The loop then sleeps
    60 seconds.  This function never returns.
    """
    while True:
        now = datetime.datetime.now(pytz.utc)
        try:
            es = events(SECRET_ICAL_URL, start=now)
            process_all_day_events(es)
        except Exception as e:
            # Keep polling: a failed fetch must not end the loop.
            print(e)
        else:
            for edge in edges:
                # Create the per-edge maintenance record on first use.
                maint = edge.setdefault('maintenance', {})
                maint['in_window'] = in_maintenance_window(es, edge['name'], now)
                maint['last_update'] = time.time()
        time.sleep(60)
Andy Bavier | 4021a2f | 2020-07-29 12:39:47 -0700 | [diff] [blame] | 121 | |
def time_out_stale_results():
    """Mark both planes of every edge silent for too long as "no result".

    An edge is stale when more than ``NO_RESULT_THRESHOLD`` seconds have
    passed since its ``last_update`` timestamp.
    """
    for edge in edges:
        age = time.time() - edge["last_update"]
        if age > NO_RESULT_THRESHOLD:
            for plane in ('control_plane', 'user_plane'):
                edge['status'][plane] = "no result"
| 128 | |
def remove_edge_from_metrics(name):
    """Drop the series labelled ``name`` from every per-edge gauge.

    Removal is best-effort and independent per gauge: a gauge that has no
    series for this edge is skipped without preventing the remaining
    gauges from being cleaned up.

    Args:
        name: edge name used as the ``name`` label value.
    """
    gauges = (
        cp_status,
        up_status,
        last_update,
        connect_test_ok,
        connect_test_down,
        ping_test_ok,
        ping_test_down,
        maint_window,
    )
    for gauge in gauges:
        try:
            gauge.remove(name)
        except KeyError:
            # No series was ever recorded for this edge on this gauge
            # (e.g. it was deleted before the metrics endpoint ran).
            pass
Andy Bavier | 4021a2f | 2020-07-29 12:39:47 -0700 | [diff] [blame] | 145 | |
@app.route('/edges/metrics', methods=['GET'])
def get_prometheus_metrics():
    """Refresh all per-edge gauges and return them in Prometheus text format.

    Stale edges are timed out first.  The placeholder edge "ace-example"
    is never exported.  The maintenance gauge is only set for edges that
    already carry a ``maintenance`` record.
    """
    time_out_stale_results()

    for edge in edges:
        edge_name = edge['name']
        if edge_name == "ace-example":
            continue

        cp_status.labels(edge_name).set(status_codes[edge['status']['control_plane']])
        up_status.labels(edge_name).set(status_codes[edge['status']['user_plane']])
        last_update.labels(edge_name).set(edge['last_update'])
        if 'maintenance' in edge:
            maint_window.labels(edge_name).set(int(edge['maintenance']['in_window']))

        # Binary gauges: "ok" is 1 only when connected, "down" is 1 only
        # when the test errored or stopped reporting.
        not_reporting = ["error", "no result"]

        cp_state = edge['status']['control_plane']
        connect_test_ok.labels(edge_name).set(int(cp_state == "connected"))
        connect_test_down.labels(edge_name).set(int(cp_state in not_reporting))

        up_state = edge['status']['user_plane']
        ping_test_ok.labels(edge_name).set(int(up_state == "connected"))
        ping_test_down.labels(edge_name).set(int(up_state in not_reporting))

    exported = (
        cp_status,
        up_status,
        last_update,
        maint_window,
        connect_test_ok,
        connect_test_down,
        ping_test_ok,
        ping_test_down,
    )
    res = [prom.generate_latest(gauge) for gauge in exported]
    return Response(res, mimetype="text/plain")
| 184 | |
Hyunsun Moon | f32ae9a | 2020-05-28 13:17:45 -0700 | [diff] [blame] | 185 | |
@app.route('/edges/healthz', methods=['GET'])
def get_health():
    """Health-check endpoint: always answers with a fixed "healthy" message."""
    return dict(message='healthy')
| 189 | |
| 190 | |
@app.route('/edges', methods=['GET'])
def get_edges():
    """Return every known edge as JSON, after timing out stale results."""
    time_out_stale_results()
    return jsonify(edges=edges)
| 195 | |
| 196 | |
@app.route('/edges/<string:name>', methods=['GET'])
def get_edge(name):
    """Return the first edge called ``name`` as JSON, or 404 if there is none."""
    time_out_stale_results()
    for candidate in edges:
        if candidate['name'] == name:
            return jsonify({'edge': candidate})
    abort(404)
| 204 | |
| 205 | |
@app.route('/edges', methods=['POST'])
def create_or_update_edge():
    """Create an edge, or update the status of an existing one.

    Expects a JSON object of the form
    ``{"name": str, "status": {"control_plane": ..., "user_plane": ...}}``.
    Any other shape is rejected with HTTP 400 instead of failing later
    with an unhandled KeyError/TypeError (HTTP 500).

    Returns:
        The stored edge as JSON with status 201, for both create and update.
    """
    payload = request.json
    if not payload or not isinstance(payload, dict):
        abort(400)
    if 'name' not in payload or 'status' not in payload:
        abort(400)

    name = payload['name']
    status = payload['status']
    # The name is concatenated into a log line and matched against URL
    # path segments, so it has to be a string.
    if not isinstance(name, str):
        abort(400)
    if not isinstance(status, dict):
        abort(400)
    if 'control_plane' not in status or 'user_plane' not in status:
        abort(400)

    req_edge = {
        'name': name,
        'status': {
            'control_plane': status['control_plane'],
            'user_plane': status['user_plane']
        },
        'last_update': time.time()
    }

    existing = next((e for e in edges if e['name'] == name), None)
    if existing is None:
        print("new edge request " + name)
        edges.append(req_edge)
    else:
        # Update in place so other fields on the record (such as the
        # maintenance info) are kept.
        existing['status']['control_plane'] = req_edge['status']['control_plane']
        existing['status']['user_plane'] = req_edge['status']['user_plane']
        existing['last_update'] = req_edge['last_update']

    return jsonify({'edge': req_edge}), 201
| 234 | |
| 235 | |
@app.route('/edges/<string:name>', methods=['DELETE'])
def delete_edge(name):
    """Delete the first edge called ``name`` and its metric series; 404 if absent."""
    print("delete edge request " + name)
    for idx, candidate in enumerate(edges):
        if candidate['name'] == name:
            # Returning right after the deletion means the list is never
            # iterated again once it has been mutated.
            del edges[idx]
            remove_edge_from_metrics(name)
            return jsonify({'result': True})
    abort(404)
| 249 | |
| 250 | |
if __name__ == '__main__':
    if SECRET_ICAL_URL and AETHER_ENV:
        print(" * Starting maintenance calendar polling thread (Aether env: %s)" % AETHER_ENV)
        # The poller loops forever; as a daemon thread it cannot keep the
        # process alive once the web server has stopped.
        t = threading.Thread(target=pull_maintenance_events, daemon=True)
        t.start()
    # Debug mode is intentionally not forced on: with host 0.0.0.0 it would
    # expose the interactive debugger on every interface, and its reloader
    # re-executes this module, starting the polling thread a second time.
    # Set FLASK_DEBUG=1 in the environment to enable it for local work.
    app.run(host='0.0.0.0', port=80)