blob: 2d0a9ed29181de2186d9614cf4c7dd141437cb5d [file] [log] [blame]
Hyunsun Moonf32ae9a2020-05-28 13:17:45 -07001#!/usr/bin/env python
2
3# Copyright 2020-present Open Networking Foundation
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
Andy Bavier614af142020-08-07 14:49:56 -070017import os
Hyunsun Moonf32ae9a2020-05-28 13:17:45 -070018import time
Andy Bavier614af142020-08-07 14:49:56 -070019import datetime
20import pytz
21import threading
22from icalevents.icalevents import events
Andy Bavier4021a2f2020-07-29 12:39:47 -070023from flask import Flask, jsonify, abort, request, Response
24import prometheus_client as prom
25
# URL of the maintenance calendar (iCal feed); read from the environment
SECRET_ICAL_URL = os.getenv("SECRET_ICAL_URL")

# Aether environment that this server is monitoring (e.g., "production").
# To schedule downtime in the calendar, postfix the cluster name with the
# environment, e.g. "ace-tucson-production".
AETHER_ENV = os.getenv("AETHER_ENV", "production")

# An edge moves to "no result" status when its agent has been silent for
# more than this many seconds (12 minutes)
NO_RESULT_THRESHOLD = 12 * 60
Hyunsun Moonf32ae9a2020-05-28 13:17:45 -070035
app = Flask(__name__)

# In-memory list of edge records, seeded with one placeholder entry.
# Each record holds the edge name, the latest control/user plane status
# strings and the time of the last report (seconds since the epoch).
edges = [
    dict(
        name='ace-example',
        status=dict(
            control_plane='connected',
            user_plane='connected',
        ),
        last_update=time.time(),
    ),
]
47
# Numeric code exported for each status string an edge can be in
status_codes = {
    "no result": -2,  # assigned locally when an edge's last report is stale
    "error": -1,
    "disconnected": 0,
    "connecting": 1,
    "connected": 2,
}

# Alternate text that also identifies an edge in a calendar event, keyed by
# the edge's full "<name>-<env>" name
room_mapping = {
    "ace-menlo-pixel-production": "(Compute)-MP-1-Aether Production",
    "ace-menlo-staging": "(Compute)-MP-1-Aether Staging",
}
60
# Legacy test status metrics, reporting a status code between -2 and 2
cp_status = prom.Gauge(
    "aetheredge_status_control_plane",
    "Control plane status code",
    labelnames=["name"],
)
up_status = prom.Gauge(
    "aetheredge_status_user_plane",
    "User plane status code",
    labelnames=["name"],
)

# Simplified binary (0/1) test result metrics
e2e_tests_ok = prom.Gauge(
    "aetheredge_e2e_tests_ok",
    "Last connect and ping test both passed",
    labelnames=["name"],
)
connect_test_ok = prom.Gauge(
    "aetheredge_connect_test_ok",
    "Last connect test passed",
    labelnames=["name"],
)
ping_test_ok = prom.Gauge(
    "aetheredge_ping_test_ok",
    "Last ping test passed",
    labelnames=["name"],
)
e2e_tests_down = prom.Gauge(
    "aetheredge_e2e_tests_down",
    "E2E tests not reporting",
    labelnames=["name"],
)

# Other metrics
last_update = prom.Gauge(
    "aetheredge_last_update",
    "Last reported test result",
    labelnames=["name"],
)
maint_window = prom.Gauge(
    "aetheredge_in_maintenance_window",
    "Currently in a maintenance window",
    labelnames=["name"],
)
74
def is_my_event(event, name):
    """Return True if a calendar event refers to the given edge.

    The edge is looked for in the event's summary, location and description.
    Names starting with "ace-" are matched in their full form
    "<name>-<AETHER_ENV>"; other names are matched as given. An event also
    matches when one of those fields contains the room_mapping entry
    registered for the full name.

    Args:
        event: calendar event object; the three text attributes may be
            missing or None.
        name: edge name as reported by the agent.

    Returns:
        True if any of the fields mentions the edge, False otherwise.
    """
    fullname = name
    if name.startswith("ace-"):
        fullname = "%s-%s" % (name, AETHER_ENV)

    # A field that exists but holds None would make the substring test raise
    # TypeError, so missing and None are both normalized to empty text.
    texts = [
        getattr(event, field, None) or ""
        for field in ("summary", "location", "description")
    ]

    if any(fullname in text for text in texts):
        return True

    room = room_mapping.get(fullname)
    return room is not None and any(room in text for text in texts)
85
def is_naive_datetime(d):
    """Tell whether datetime *d* lacks usable timezone information.

    A datetime counts as naive when it has no tzinfo at all, or when its
    tzinfo cannot produce a UTC offset for it.
    """
    tz = d.tzinfo
    if tz is None:
        return True
    return tz.utcoffset(d) is None
88
def process_all_day_events(es):
    """Attach the US/Pacific timezone to naive all-day event boundaries.

    All-day events have naive datetimes, which breaks comparisons against
    timezone-aware datetimes. Events are modified in place; events that are
    not all-day are left untouched.
    """
    for entry in es:
        if not entry.all_day:
            continue
        pacific = pytz.timezone('US/Pacific')
        for boundary in ("start", "end"):
            moment = getattr(entry, boundary)
            if is_naive_datetime(moment):
                setattr(entry, boundary, pacific.localize(moment))
98
def in_maintenance_window(events, name, now):
    """Report whether edge *name* has a maintenance event in progress.

    An event is in progress when *now* lies strictly between its start and
    its end; only such events are checked against the edge name.
    """
    return any(
        candidate.start < now and candidate.end > now
        and is_my_event(candidate, name)
        for candidate in events
    )
105
def pull_maintenance_events():
    """Poll the maintenance calendar forever and update every edge.

    Once a minute the calendar at SECRET_ICAL_URL is fetched for events from
    the current time onwards, and each edge's 'maintenance' record gets the
    current 'in_window' flag plus the time of this refresh. Never returns;
    meant to run as the target of a background thread.
    """
    while True:
        now = datetime.datetime.now(pytz.utc)
        try:
            es = events(SECRET_ICAL_URL, start=now)
            process_all_day_events(es)
            for edge in edges:
                # Build the complete record before publishing it, so code
                # reading edge['maintenance'] never sees it without
                # 'in_window'.
                refreshed = {
                    'in_window': in_maintenance_window(es, edge['name'], now),
                    'last_update': time.time(),
                }
                if 'maintenance' in edge:
                    edge['maintenance'].update(refreshed)
                else:
                    edge['maintenance'] = refreshed
        except Exception as e:
            # Top-level boundary of the polling loop: a failure while
            # fetching or evaluating events must not end the loop, so report
            # it and retry on the next cycle.
            print(e)
        time.sleep(60)
Andy Bavier4021a2f2020-07-29 12:39:47 -0700121
def time_out_stale_results():
    """Flag edges whose last report is older than NO_RESULT_THRESHOLD.

    Both the control plane and the user plane status of such an edge are
    overwritten with "no result".
    """
    for record in edges:
        age = time.time() - record["last_update"]
        if age > NO_RESULT_THRESHOLD:
            for plane in ('control_plane', 'user_plane'):
                record['status'][plane] = "no result"
128
def remove_edge_from_metrics(name):
    """Drop the series labelled *name* from every exported gauge.

    Removal is best effort and handled per gauge: a gauge that holds no
    series for this edge is skipped without preventing the remaining gauges
    from being cleaned up.

    Args:
        name: edge name used as the "name" label value.
    """
    gauges = (
        cp_status,
        up_status,
        last_update,
        e2e_tests_ok,
        connect_test_ok,
        ping_test_ok,
        e2e_tests_down,
        maint_window,
    )
    for gauge in gauges:
        try:
            gauge.remove(name)
        except KeyError:
            # No series was ever recorded for this edge on this gauge
            pass
Andy Bavier4021a2f2020-07-29 12:39:47 -0700145
@app.route('/edges/metrics', methods=['GET'])
def get_prometheus_metrics():
    """Serve the Prometheus metrics for all reported edges.

    Stale results are timed out first, then every gauge is refreshed from
    the in-memory edge records (the "ace-example" placeholder is skipped)
    and the text exposition of each gauge is returned as text/plain.
    """
    time_out_stale_results()
    not_reporting = ("error", "no result")

    for edge in edges:
        name = edge['name']
        if name == "ace-example":
            continue

        connect_status = edge['status']['control_plane']
        ping_status = edge['status']['user_plane']

        # Status strings come straight from agent requests; an unknown one
        # is treated as "error" so it cannot fail the whole scrape.
        if connect_status not in status_codes:
            connect_status = "error"
        if ping_status not in status_codes:
            ping_status = "error"

        cp_status.labels(name).set(status_codes[connect_status])
        up_status.labels(name).set(status_codes[ping_status])
        last_update.labels(name).set(edge['last_update'])
        if 'maintenance' in edge:
            maint_window.labels(name).set(int(edge['maintenance']['in_window']))

        # Binary metrics: when either test is in error or not reporting,
        # only "down" is raised and all "ok" flags stay at 0.
        tests_down = connect_status in not_reporting or ping_status in not_reporting
        connect_ok = not tests_down and connect_status == "connected"
        ping_ok = not tests_down and ping_status == "connected"

        connect_test_ok.labels(name).set(int(connect_ok))
        ping_test_ok.labels(name).set(int(ping_ok))
        e2e_tests_ok.labels(name).set(int(connect_ok and ping_ok))
        e2e_tests_down.labels(name).set(int(tests_down))

    exported = (
        cp_status,
        up_status,
        last_update,
        maint_window,
        connect_test_ok,
        ping_test_ok,
        e2e_tests_ok,
        e2e_tests_down,
    )
    res = [prom.generate_latest(gauge) for gauge in exported]

    return Response(res, mimetype="text/plain")
188
Hyunsun Moonf32ae9a2020-05-28 13:17:45 -0700189
@app.route('/edges/healthz', methods=['GET'])
def get_health():
    """Health check endpoint: always answers with a fixed 'healthy' message."""
    body = dict(message='healthy')
    return body
193
194
@app.route('/edges', methods=['GET'])
def get_edges():
    """Return all edge records as JSON, after timing out stale results."""
    time_out_stale_results()
    payload = {'edges': edges}
    return jsonify(payload)
199
200
@app.route('/edges/<string:name>', methods=['GET'])
def get_edge(name):
    """Return the record of the edge called *name* as JSON.

    Stale results are timed out first. Responds with 404 when no edge has
    that name.
    """
    time_out_stale_results()
    found = next((item for item in edges if item['name'] == name), None)
    if found is None:
        abort(404)
    return jsonify({'edge': found})
208
209
@app.route('/edges', methods=['POST'])
def create_or_update_edge():
    """Register a new edge or update the status of a known one.

    Expects a JSON object with a 'name' and a 'status' object carrying
    'control_plane' and 'user_plane'. The time of the request is stored as
    the edge's last update.

    Returns:
        The stored status as JSON with HTTP 201. Responds with 400 when the
        body is empty, is not a JSON object, or lacks a required field.
    """
    payload = request.json
    if not payload or not isinstance(payload, dict):
        abort(400)
    if 'name' not in payload or 'status' not in payload:
        abort(400)

    name = payload['name']
    status = payload['status']
    # A malformed status must be rejected as a client error instead of
    # surfacing as an unhandled KeyError/TypeError below.
    if not isinstance(status, dict):
        abort(400)
    if 'control_plane' not in status or 'user_plane' not in status:
        abort(400)

    req_edge = {
        'name': name,
        'status': {
            'control_plane': status['control_plane'],
            'user_plane': status['user_plane']
        },
        'last_update': time.time()
    }

    known = next((edge for edge in edges if edge['name'] == name), None)
    if known is None:
        # format() keeps the log line from failing on a non-string name
        print("new edge request {}".format(name))
        edges.append(req_edge)
    else:
        known['status']['control_plane'] = req_edge['status']['control_plane']
        known['status']['user_plane'] = req_edge['status']['user_plane']
        known['last_update'] = req_edge['last_update']

    return jsonify({'edge': req_edge}), 201
238
239
@app.route('/edges/<string:name>', methods=['DELETE'])
def delete_edge(name):
    """Delete the edge called *name* together with its metric series.

    Only the first record with that name is removed. Responds with 404 when
    no edge has that name.
    """
    print("delete edge request " + name)
    for position, record in enumerate(edges):
        if record['name'] != name:
            continue
        del edges[position]
        remove_edge_from_metrics(name)
        return jsonify({'result': True})
    abort(404)
253
254
if __name__ == '__main__':
    if SECRET_ICAL_URL and AETHER_ENV:
        print(" * Starting maintenance calendar polling thread (Aether env: %s)" % AETHER_ENV)
        t = threading.Thread(target=pull_maintenance_events)
        # The poller loops forever; as a daemon thread it cannot keep the
        # process alive once the web server stops.
        t.daemon = True
        t.start()

    # SECURITY: the server listens on all interfaces, and the interactive
    # debugger lets anyone who can reach it run arbitrary code. Debug mode is
    # therefore opt-in through FLASK_DEBUG instead of always on.
    debug = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    # The reloader would re-run this script in a child process and start a
    # second polling thread, so it stays off even in debug mode.
    app.run(debug=debug, use_reloader=False, host='0.0.0.0', port=80)