blob: 7e55e37a716a30f0e4e9d3019a3e0ff6234bfe36 [file] [log] [blame]
# !/usr/bin/env python3

# SPDX-FileCopyrightText: 2020 The Magma Authors.
# SPDX-FileCopyrightText: 2022 Open Networking Foundation <support@opennetworking.org>
#
# SPDX-License-Identifier: BSD-3-Clause

8import asyncio
9import os
10import subprocess
11from datetime import datetime
12
13import apt
14from dateutil import tz
15from common.health.entities import (
16 ActiveState,
17 Errors,
18 HealthStatus,
19 HealthSummary,
20 RestartFrequency,
21 ServiceHealth,
22 Version,
23)
24from common.service import MagmaService
25from common.service_registry import ServiceRegistry
26from configuration.mconfig_managers import load_service_mconfig_as_json
27from magmad.metrics import UNEXPECTED_SERVICE_RESTARTS
28from magmad.service_poller import ServicePoller
29from orc8r.protos import common_pb2, magmad_pb2
30from orc8r.protos.magmad_pb2_grpc import MagmadStub
31from orc8r.protos.mconfig import mconfigs_pb2
32from pystemd.systemd1 import Unit
33
34
class GenericHealthChecker:
    """Collect health information for a gateway running Magma services.

    The checker gathers connectivity status (via magmad network tests),
    per-service systemd state and syslog error counts, unexpected restart
    counters, and kernel / package version information.
    """

    @staticmethod
    def _run_command(args, error_message):
        """Run a command and return its raw standard output.

        Args:
            args: Command and arguments as a list of strings.
            error_message: Message for the ValueError raised on failure.

        Returns:
            The bytes the command wrote to standard output.

        Raises:
            ValueError: The command exited with a non-zero status.
        """
        # stderr is captured so a failing command does not write to the
        # caller's terminal; failure is detected through the exit status.
        result = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        if result.returncode != 0:
            raise ValueError(error_message)
        return result.stdout

    def ping(self, host, num_packets=4):
        """Ask the local magmad service to ping a host.

        Args:
            host: Host name or IP address to ping.
            num_packets: Number of packets to send.

        Returns:
            The ``pings`` field of the RunNetworkTests response.
        """
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        response = client.RunNetworkTests(
            magmad_pb2.NetworkTestRequest(
                pings=[
                    magmad_pb2.PingParams(
                        host_or_ip=host,
                        num_packets=num_packets,
                    ),
                ],
            ),
        )
        return response.pings

    def ping_status(self, host):
        """Translate the ping result for a host into a HealthStatus.

        Args:
            host: Host name or IP address to ping.

        Returns:
            HealthStatus.DOWN when the ping reports an error,
            HealthStatus.UP when an average response time is reported,
            HealthStatus.UNKNOWN otherwise (including an empty result).
        """
        pings = self.ping(host=host, num_packets=4)
        # Guard against an empty result instead of failing on pings[0].
        if not pings:
            return HealthStatus.UNKNOWN
        result = pings[0]
        if result.error:
            return HealthStatus.DOWN
        if result.avg_response_ms:
            return HealthStatus.UP
        return HealthStatus.UNKNOWN

    def get_error_summary(self, service_names):
        """Get the list of services with the error count.

        The count is the number of syslog lines that mention the service
        and contain 'ERROR', counted since the last
        'Starting <service>...' line for that service.

        Args:
            service_names: List of service names.

        Returns:
            A dictionary with service name as a key and the Errors object
            as a value.

        Raises:
            PermissionError: User has no permission to read the syslog.
        """
        res = {}
        for service_name in service_names:
            config = load_service_mconfig_as_json(service_name)
            res[service_name] = Errors(
                log_level=config.get('logLevel', 'INFO'),
                error_count=0,
            )

        syslog_path = '/var/log/syslog'
        if not os.access(syslog_path, os.R_OK):
            raise PermissionError(
                'syslog is not readable. '
                'Try `sudo chmod a+r {}`. '
                'Or execute the command with sudo '
                'permissions: `venvsudo`'.format(syslog_path),
            )

        # Build the per-service start markers once rather than per line.
        start_markers = {
            service_name: 'Starting {}...'.format(service_name)
            for service_name in service_names
        }
        # errors='replace' keeps the scan going when the log contains
        # bytes that are not valid UTF-8.
        with open(
            syslog_path, 'r', encoding='utf-8', errors='replace',
        ) as f:
            for line in f:
                for service_name in service_names:
                    if service_name not in line:
                        continue
                    # Reset the counter for restart/start
                    if start_markers[service_name] in line:
                        res[service_name].error_count = 0
                    elif 'ERROR' in line:
                        res[service_name].error_count += 1
        return res

    def get_magma_services_summary(self):
        """Get health for all the running services.

        Service names are the keys of the configs returned by magmad's
        GetConfigs call; each is looked up as the systemd unit
        ``magma@<name>.service``.

        Returns:
            A list with one ServiceHealth per service.

        Raises:
            ValueError: The running time of an active service could not
                be read with `ps`.
            PermissionError: The syslog is not readable
                (see get_error_summary).
        """
        services_health_summary = []

        # DBus objects: https://www.freedesktop.org/wiki/Software/systemd/dbus/
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        configs = client.GetConfigs(common_pb2.Void())

        service_names = [str(name) for name in configs.configs_by_key]
        services_errors = self.get_error_summary(service_names=service_names)

        for service_name in service_names:
            unit = Unit(
                'magma@{}.service'.format(service_name),
                _autoload=True,
            )
            active_state = ActiveState.dbus2state[unit.Unit.ActiveState]
            sub_state = str(unit.Unit.SubState, 'utf-8')
            if active_state == ActiveState.ACTIVE:
                pid = unit.Service.MainPID
                # `etime=` prints the elapsed time with no header line.
                time_running = self._run_command(
                    ['ps', '-o', 'etime=', '-p', str(pid)],
                    'Cannot get time running for the service '
                    '{} `ps -o etime= -p {}`'
                    .format(service_name, pid),
                )
            else:
                # Placeholder running time for services that are not active.
                time_running = b'00'

            services_health_summary.append(
                ServiceHealth(
                    service_name=service_name,
                    active_state=active_state, sub_state=sub_state,
                    time_running=str(time_running, 'utf-8').strip(),
                    errors=services_errors[service_name],
                ),
            )
        return services_health_summary

    def get_unexpected_restart_summary(self):
        """Get the unexpected restart count for every polled service.

        Creates a MagmaService for magmad, starts a ServicePoller on its
        event loop, refreshes the poller's service info and reads the
        UNEXPECTED_SERVICE_RESTARTS counter for each service.

        Returns:
            A dictionary with service name as a key and a RestartFrequency
            (with an empty time_interval) as a value.
        """
        service = MagmaService('magmad', mconfigs_pb2.MagmaD())
        service_poller = ServicePoller(service.loop, service.config)
        service_poller.start()

        asyncio.set_event_loop(service.loop)

        # noinspection PyProtectedMember
        # pylint: disable=protected-access
        async def fetch_info():
            restart_frequencies = {}
            await service_poller._get_service_info()
            for service_name in service_poller.service_info.keys():
                restarts = int(
                    UNEXPECTED_SERVICE_RESTARTS
                    .labels(service_name=service_name)
                    ._value.get(),
                )
                restart_frequencies[service_name] = RestartFrequency(
                    count=restarts,
                    time_interval='',
                )

            return restart_frequencies

        return service.loop.run_until_complete(fetch_info())

    def get_kernel_version(self):
        """Return the output of `uname -a` as a string.

        Raises:
            ValueError: `uname -a` exited with a non-zero status.
        """
        info = self._run_command(
            ['uname', '-a'],
            'Cannot get the kernel version',
        )
        return str(info, 'utf-8')

    def get_magma_version(self):
        """Return the version of the `magma` apt package.

        The package version string is split on '-': the first field
        (after any '=') is used as the version code and the second field
        is parsed as a Unix timestamp for the last update time.

        Returns:
            A Version. When the `magma` package is not in the apt cache,
            the `python3` package version is reported instead with '-' as
            the last update time. '-' is also used when the version
            string carries no parsable timestamp field.
        """
        cache = apt.Cache()

        # Return the python version if magma is not there
        if 'magma' not in cache:
            return Version(
                # str() keeps version_code a string in both branches.
                version_code=str(cache['python3'].versions[0]),
                last_update_time='-',
            )

        pkg = str(cache['magma'].versions[0])
        fields = pkg.split('-')
        version = fields[0].split('=')[-1]
        try:
            timestamp = int(fields[1])
        except (IndexError, ValueError):
            # No usable timestamp field: still report the version code.
            return Version(version_code=version, last_update_time='-')

        return Version(
            version_code=version,
            # Build an aware UTC datetime, then convert to local time.
            last_update_time=datetime.fromtimestamp(timestamp, tz=tz.tzutc())
            .astimezone(tz=tz.tzlocal())
            .strftime('%Y-%m-%d %H:%M:%S'),
        )

    def get_health_summary(self):
        """Assemble the overall HealthSummary.

        Internet health is the ping status of 8.8.8.8 and DNS health is
        the ping status of google.com.

        Returns:
            A HealthSummary built from the other methods of this class.
        """
        return HealthSummary(
            version=self.get_magma_version(),
            platform=self.get_kernel_version(),
            services_health=self.get_magma_services_summary(),
            internet_health=self.ping_status(host='8.8.8.8'),
            dns_health=self.ping_status(host='google.com'),
            unexpected_restarts=self.get_unexpected_restart_summary(),
        )