blob: 6d1ef2f719a2efd8240f2dfc6dad9fe3befc7eaf [file] [log] [blame]
Sergio Slobodrianf39aaf82017-02-28 16:10:16 -05001#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright 2017 the original author or authors.
5#
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17#
18
19# This is a very simple implementation of a dashboard creation service that
20# listens to the Kafka bus on the voltha.kpis topic looking for performance
21# monitoring metrics for olts. If a new olt appears on the bus the service will
22# create a dashboard for it for both packet and byte stats creating one row per
23# port/stream and in each row one panel for packet stats and one for byte
24# stats.
25#
26# TODO: Capture all of the metadata for existing dashboards from Grafana. We're
27# only capturing the device and device id from the title which is good enough
28# for now.
29# TODO: Leverage Grafana to act as a template builder simplifying the
30# specification of a template without having to resort to a separate API for
31# the dashd service. The basic premise is a dashboard with any name except
32# voltha.template is created for any device. Once happy with the dashboard it's
33# renamed voltha.template and this will automatically trigger the creation of a
34# new template to use for all dashboards. All existing dashboards are
35# immediately deleted and new ones are created using the template. The template
36# is renamed voltha.template.active and can be deleted at this point. This has
37# been started.
38
39#
40# Metadata format.
41# The metadata for each device from which relevant metrics are received are
42# stored in a dash_meta dictionary structure as follows.
43#
44# {<device_id1>: {
45# device:<device_type>,
46# slug:<grafana_slug>,
47# timer: <timer_val>
48# created: <creation_status>
49# ports: {
50# <port_id>:[
51# <metric1>,
52# <metric2>,
53# ...,
54# <metricN>
55# ]
56# }
57# },
58# ...
59# <device_idN>: {
60# }
61# }
62#
63
64from structlog import get_logger
65from argparse import ArgumentParser
66
67from afkak.client import KafkaClient
68from afkak.common import (
69 KafkaUnavailableError,
70 OFFSET_LATEST)
71from afkak.consumer import Consumer
72from twisted.internet import reactor
73from twisted.internet.defer import DeferredList, inlineCallbacks
74from twisted.python.failure import Failure
75from twisted.internet.task import LoopingCall
76
77from common.utils.consulhelpers import get_endpoint_from_consul
78import requests
79import json
80import re
81import sys
Sergio Slobodrian61999e52017-04-03 19:09:11 -040082import time
Sergio Slobodrianf39aaf82017-02-28 16:10:16 -050083from dashd.dash_template import DashTemplate
84
85log = get_logger()
86
87
class DashDaemon(object):
    """Create and age Grafana dashboards for OLT devices seen on Kafka.

    The daemon consumes a Kafka topic, extracts ``voltha.<device>.<id>.<port>``
    metric prefixes from every message and keeps a ``dash_meta`` dictionary
    keyed by device id.  Each entry holds::

        {'device': <device type>, 'timer': <seconds left>,
         'created': <bool>, 'ports': {<port>: [<metric>, ...]},
         'slug': <grafana slug, set once the dashboard is created>}

    A dashboard is posted to Grafana the first time a device id shows up, and
    a countdown timer per device is refreshed whenever the device is seen
    again.
    """

    # Dashboard titles of interest look like "<something>olt.<device id>".
    _DASH_TITLE_RE = re.compile(r'(.*olt)\.([0-9a-zA-Z]+)')
    # Metric prefixes look like "voltha.<something>olt.<device id>.<port>".
    _METRIC_KEY_RE = re.compile(r'voltha\.(.*olt)\.([0-9a-zA-Z]+)\.(.*)')
    # refIds handed out, in order, to the targets of a single panel.
    _REF_IDS = "ABCDEFGHIJKLMNOP"

    def __init__(self, consul_endpoint, grafana_url, topic="voltha.heartbeat"):
        """Resolve the Kafka endpoint through consul and build the client.

        :param consul_endpoint: consul address passed to
            ``get_endpoint_from_consul``.
        :param grafana_url: base URL of the Grafana HTTP API.
        :param topic: Kafka topic to consume.

        Blocks, retrying every 10 seconds, until consul returns a Kafka
        endpoint.
        """
        self.dash_meta = {}
        self.timer_resolution = 10   # seconds between countdown ticks
        self.timer_duration = 600    # seconds a dashboard stays "fresh"
        self.topic = topic
        self.dash_template = DashTemplate(grafana_url)
        self.grafana_url = grafana_url
        self.kafka_endpoint = None
        self.consul_endpoint = consul_endpoint
        while True:
            try:
                self.kafka_endpoint = get_endpoint_from_consul(
                    self.consul_endpoint, 'kafka')
                break
            except Exception:
                # ``Exception`` rather than a bare except so that Ctrl-C /
                # SystemExit can still terminate the retry loop.
                log.error("unable-to-communicate-with-consul")
                time.sleep(10)
        self.on_start_callback = None

        self._client = KafkaClient(self.kafka_endpoint)
        self._consumer_list = []  # List of consumers
        # List of deferred returned from consumers' start() methods
        self._consumer_d_list = []

    def set_on_start_callback(self, on_start_callback):
        """Store a start callback and return ``self`` for chaining.

        This function is currently unused, future requirements.
        """
        self.on_start_callback = on_start_callback
        return self

    @inlineCallbacks
    def start(self):
        """Start one consumer per partition and the periodic housekeeping.

        Waits for topic metadata, starts the consumers, makes sure the
        Grafana data source exists, seeds ``dash_meta`` from the dashboards
        already present in Grafana and finally starts the countdown and
        template-checker looping calls.
        """
        # Function-scope import: only needed here and keeps the module's
        # import block untouched.
        from twisted.internet.task import deferLater

        partitions = []
        try:
            while not partitions:
                yield self._client.load_metadata_for_topics(self.topic)
                e = self._client.metadata_error_for_topic(self.topic)
                if not e:
                    partitions = self._client.topic_partitions[self.topic]
                    break
                log.warning('no-metadata-for-topic', error=e,
                            topic=self.topic)
                # Wait without blocking the reactor so the Kafka client can
                # keep servicing its connections while we back off.
                yield deferLater(reactor, 20, lambda: None)
        except KafkaUnavailableError:
            log.error("unable-to-communicate-with-Kafka-brokers")
            self.stop()
            # Shutdown has been requested; do not go on to poll Grafana or
            # start the periodic tasks.
            return

        def _note_consumer_stopped(result, consumer):
            log.info('consumer-stopped', consumer=consumer,
                     result=result)

        for partition in partitions:
            c = Consumer(self._client, self.topic, partition,
                         self.msg_processor)
            self._consumer_list.append(c)
            log.info('consumer-started', topic=self.topic, partition=partition)
            d = c.start(OFFSET_LATEST)
            d.addBoth(_note_consumer_stopped, c)
            self._consumer_d_list.append(d)

        # Now read the list of existing dashboards from Grafana and create the
        # dictionary of dashboard timers. If we've crashed there will be
        # dashboards there. Just add them and if they're no longer valid
        # they'll be deleted. If they are valid then they'll persist.
        #
        # NOTE: the Grafana waits below are left blocking. Nothing is yielded
        # between starting the consumers and seeding dash_meta, so
        # (presumably) no Kafka message can be processed before the existing
        # dashboards are known.
        try:
            self._ensure_data_source()
            self._load_existing_dashboards()

            # Start the dashboard countdown processor
            log.info("starting-countdown-processor")
            LoopingCall(self._countdown_processor).start(
                self.timer_resolution)

            log.info("starting-template-checker")
            LoopingCall(self._template_checker).start(self.timer_resolution)
        except Exception:
            e = sys.exc_info()
            log.error("error", error=e)

    def _wait_for_grafana(self, path):
        """Poll ``grafana_url + path`` every 10 s until it answers 200."""
        while True:
            r = requests.get(self.grafana_url + path)
            if r.status_code == requests.codes.ok:
                return r
            time.sleep(10)

    def _ensure_data_source(self):
        """Create the "Voltha Stats" data source unless Grafana has it."""
        sources = self._wait_for_grafana("/datasources").json()
        if any(src["name"] == "Voltha Stats" for src in sources):
            return
        r = requests.post(self.grafana_url + "/datasources",
                          data={"name": "Voltha Stats", "type": "graphite",
                                "access": "proxy",
                                "url": "http://localhost:81"})
        log.info('data-source-added', status=r.status_code, text=r.text)

    def _new_meta(self, device):
        """Return a fresh ``dash_meta`` record for a device type."""
        return {
            'timer': self.timer_duration,
            'device': device,
            'created': False,
            'ports': {},
        }

    def _load_existing_dashboards(self):
        """Seed ``dash_meta`` from dashboards already stored in Grafana."""
        for entry in self._wait_for_grafana("/search?").json():
            # Look for dashboards that have a title of *olt.<hex digits>.
            # These will be the ones of interest. Others should just be left
            # alone.
            match = self._DASH_TITLE_RE.search(entry['title'])
            if match:
                # TODO: We should really capture all of the chart data
                # including the rows, panels, and data points being logged.
                # This is good enough for now though to determine if
                # there's already a dashboard for a given device.
                self.dash_meta[match.group(2)] = \
                    self._new_meta(match.group(1))

    def _countdown_processor(self):
        """Tick every dashboard timer down by ``timer_resolution`` seconds.

        Called every ``timer_resolution`` seconds. Deleting the dashboard
        when a timer reaches 0 is not implemented yet; it is only logged.
        """
        try:
            for dashboard, meta in self.dash_meta.items():
                # Issue a log if the counter decrement is somewhat relevant
                if (meta['timer'] % 100 == 0 and
                        meta['timer'] != self.timer_duration):
                    log.info("counting-down", dashboard=dashboard,
                             timer=meta['timer'])
                meta['timer'] -= self.timer_resolution
                if meta['timer'] <= 0:
                    # Delete the dashboard here
                    log.info("FIXME:-Should-delete-the-dashboard-here",
                             dashboard=dashboard)
        except Exception:
            e = sys.exc_info()
            log.error("error", error=e)

    @inlineCallbacks
    def _template_checker(self):
        """Apply a user supplied template dashboard if exactly one exists.

        Called every so often (timer_resolution seconds because it's
        convenient) to check if a template dashboard has been defined in
        Grafana. If it has been, replace the built in template with the one
        provided.
        """
        try:
            r = requests.get(self.grafana_url + "/search?query=template")
            db = r.json()
            if len(db) == 1:
                # Apply the template
                yield self.dash_template.apply_template(db[0])
            elif len(db) != 0:
                # This is an error, log it.
                log.warning("More-than-one-template-provided-ignoring")
        except Exception:
            e = sys.exc_info()
            log.error("error", error=e)

    def stop(self):
        """Stop all consumers, close the Kafka client, stop the reactor."""
        log.info("\n")
        log.info('end-of-execution-stopping-consumers')
        # Ask each of our consumers to stop. When a consumer fully stops, it
        # fires the deferred returned from its start() method. We saved all
        # those deferreds away (above, in start()) in self._consumer_d_list,
        # so now we'll use a DeferredList to wait for all of them...
        for consumer in self._consumer_list:
            consumer.stop()
        dl = DeferredList(self._consumer_d_list)

        # Once the consumers are all stopped, then close our client
        def _stop_client(result):
            if isinstance(result, Failure):
                log.error('error', result=result)
            else:
                log.info('all-consumers-stopped', client=self._client)
            self._client.close()
            return result

        dl.addBoth(_stop_client)

        # And once the client is shutdown, stop the reactor
        def _stop_reactor(result):
            reactor.stop()
            return result

        dl.addBoth(_stop_reactor)

    def check_for_dashboard(self, msg):
        """Refresh timers for known devices; return prefixes still to handle.

        :param msg: consumed Kafka message; ``msg.message.value`` must be a
            JSON document with a ``prefixes`` mapping.
        :returns: dict of ``{prefix: prefix data}`` for every matching prefix
            that did not just refresh a timer.

        The first prefix seen for a known device refreshes its timer; every
        other matching prefix (unknown device, or a further port of a device
        already refreshed in this message) is returned to the caller.
        """
        need_dash = {}
        done = set()
        metrics = json.loads(msg.message.value)['prefixes']
        for key in metrics:
            match = self._METRIC_KEY_RE.search(key)
            if not match:
                continue
            device_id = match.group(2)
            if device_id in self.dash_meta and device_id not in done:
                meta = self.dash_meta[device_id]
                # Remember the old value: the comparison below has to look at
                # the timer as it was *before* the reset, otherwise the log
                # can never fire.
                previous = meta['timer']
                # Update the delete countdown timer
                meta['timer'] = self.timer_duration
                done.add(device_id)
                # Issue a log if the reset is somewhat relevant.
                if previous < self.timer_duration - self.timer_resolution:
                    log.info("reset-timer", device=device_id)
            else:
                # No dashboard exists (or timer already refreshed)
                need_dash[key] = metrics[key]
        return need_dash

    def create_dashboards(self, createList):
        """Build and post one Grafana dashboard per device id.

        :param createList: iterable of device ids present in ``dash_meta``.

        For each device one row per template row is created for every port,
        and within each row one panel per template panel. A failed post is
        logged and skipped so the remaining dashboards are still created.
        """
        for dash in createList:
            meta = self.dash_meta[dash]
            newDash = json.loads(self.dash_template.dashBoard)
            newDash['dashboard']['title'] = meta['device'] + '.' + dash
            # The port is the main grouping attribute
            for port in meta['ports']:
                # Add in the rows for the port specified by the template
                for row in self.dash_template.rows:
                    new_row = json.loads(self.dash_template.dashRow)
                    # Plain str.replace: the substituted values come from
                    # message data and must not be parsed as regex escapes.
                    new_row['title'] = row['title'].replace('%port%', port)
                    # Add the panels to the row per the template
                    panelId = 1
                    for panel in self.dash_template.panels:
                        new_panel = json.loads(self.dash_template.dashPanel)
                        new_panel['id'] = panelId
                        panelId += 1
                        new_panel['title'] = panel['title'].replace(
                            '%port%', port.upper())
                        dataId = 0
                        # Add the targets to the panel
                        for dpoint in sorted(meta['ports'][port]):
                            if dpoint not in panel:
                                continue
                            if dataId >= len(self._REF_IDS):
                                # Out of refIds; skip the rest instead of
                                # aborting every dashboard with IndexError.
                                log.warning("too-many-targets-for-panel",
                                            dashboard=dash, port=port)
                                break
                            target = panel[dpoint].replace('%port%', port)
                            target = target.replace('%device%',
                                                    meta['device'])
                            target = target.replace('%deviceId%', dash)
                            new_panel['targets'].append(
                                {'refId': self._REF_IDS[dataId],
                                 'target': target})
                            dataId += 1
                        new_row['panels'].append(new_panel)
                    newDash['dashboard']['rows'].append(new_row)
            resp = requests.post(self.grafana_url + "/dashboards/db",
                                 json=newDash)
            if not resp.ok:
                log.error("dashboard-creation-failed", dashboard=dash,
                          status=resp.status_code, text=resp.text)
                continue
            meta['slug'] = resp.json()['slug']
            meta['created'] = True
            log.info("created-dashboard", slug=meta['slug'])

    def msg_processor(self, consumer, msglist):
        """Consumer callback: record port metrics and create new dashboards.

        :param consumer: the consumer delivering the messages (unused).
        :param msglist: list of consumed Kafka messages.

        Any exception is logged and swallowed so a bad message cannot stop
        the consumer.
        """
        try:
            createList = []
            for msg in msglist:
                # Reset the timer for existing dashboards and get back a dict
                # of dashboards to create if any.
                need_dash = self.check_for_dashboard(msg)
                # Now populate the meta data for all missing dashboards
                for key in need_dash:
                    match = self._METRIC_KEY_RE.search(key)
                    if not match:
                        continue
                    device, device_id, port = match.groups()
                    if device_id not in self.dash_meta:
                        # Not there, create a meta-data record for the
                        # device; the port is added just below.
                        createList.append(device_id)
                        self.dash_meta[device_id] = self._new_meta(device)
                    # TODO: The keys below are the names of the metrics
                    # that are in the Kafka record. This auto-discovery
                    # is fine if all that's needed are raw metrics. If
                    # metrics are "cooked" by a downstream process and
                    # subsequently fed to graphite/carbon without being
                    # re-posted to Kafka, discovery becomes impossible.
                    # In those cases and in cases where finer grain
                    # control of what's displayed is required, a config
                    # file would be necessary.
                    #
                    # Copied into a list so the stored value does not stay
                    # tied to the decoded message.
                    self.dash_meta[device_id]['ports'][port] = \
                        list(need_dash[key]['metrics'].keys())
            # Now go ahead and create the dashboards using the meta data that
            # was just populated for them.
            if createList:  # Create any missing dashboards.
                self.create_dashboards(createList)
        except Exception:
            e = sys.exc_info()
            log.error("error", error=e)
408
def parse_options():
    """Parse the daemon's command line (``sys.argv``).

    :returns: an ``argparse.Namespace`` with ``consul``, ``topic``,
        ``grafana_url``, ``kafka`` and ``host`` attributes.
    """
    # The text is the program *description*; passed positionally it would be
    # taken as ``prog`` and end up in the usage line as the program name.
    parser = ArgumentParser(description="Manage Grafana Dashboards")
    parser.add_argument("-c", "--consul",
                        help="consul ip and port",
                        default='10.100.198.220:8500')

    parser.add_argument("-t", "--topic",
                        help="topic to listen from",
                        default="voltha.kpis")

    parser.add_argument("-g", "--grafana_url",
                        help="graphana api url",
                        default="http://admin:admin@localhost:8882/api")

    parser.add_argument("-k", "--kafka",
                        help="kafka bus",
                        default=None)

    parser.add_argument("-s", "--host",
                        help="docker host ip",
                        default=None)

    return parser.parse_args()
432
def main():
    """Configure logging, build the daemon and run the Twisted reactor.

    Returns once the reactor has been stopped.
    """
    # Function-scope import: ``logging`` is used below but is not imported at
    # module level, which made this function fail with NameError.
    import logging

    logging.basicConfig(
        format='%(asctime)s:%(name)s:' +
               '%(levelname)s:%(process)d:%(message)s',
        level=logging.INFO
    )

    args = parse_options()

    dashd = DashDaemon(args.consul, args.grafana_url, args.topic)
    # start() needs a running reactor, so schedule it instead of calling it.
    reactor.callWhenRunning(dashd.start)
    reactor.run()
    log.info("completed!")


if __name__ == "__main__":
    main()