blob: 422833051a6d177a2d40040921d19a3eec92dfcf [file] [log] [blame]
#!/usr/bin/env python3

"""
Copyright 2020 The Magma Authors.

This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree.

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
15
16import asyncio
17import os
18import subprocess
19from datetime import datetime
20
21import apt
22from dateutil import tz
23from common.health.entities import (
24 ActiveState,
25 Errors,
26 HealthStatus,
27 HealthSummary,
28 RestartFrequency,
29 ServiceHealth,
30 Version,
31)
32from common.service import MagmaService
33from common.service_registry import ServiceRegistry
34from configuration.mconfig_managers import load_service_mconfig_as_json
35from magmad.metrics import UNEXPECTED_SERVICE_RESTARTS
36from magmad.service_poller import ServicePoller
37from orc8r.protos import common_pb2, magmad_pb2
38from orc8r.protos.magmad_pb2_grpc import MagmadStub
39from orc8r.protos.mconfig import mconfigs_pb2
40from pystemd.systemd1 import Unit
41
42
class GenericHealthChecker:
    """Gather health information about the local Magma installation.

    The individual methods report the installed package version, the
    kernel description, the state of each magma systemd unit, network
    reachability (via magmad's network tests) and unexpected service
    restart counts. `get_health_summary` combines all of them.
    """

    def ping(self, host, num_packets=4):
        """Ask the local magmad service to ping a host.

        Args:
            host: Host name or IP address to ping.
            num_packets: Number of ping packets to request.

        Returns:
            The `pings` field of the RunNetworkTests response.
        """
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        response = client.RunNetworkTests(
            magmad_pb2.NetworkTestRequest(
                pings=[
                    magmad_pb2.PingParams(
                        host_or_ip=host,
                        num_packets=num_packets,
                    ),
                ],
            ),
        )
        return response.pings

    def ping_status(self, host):
        """Translate the ping result for a host into a HealthStatus.

        Args:
            host: Host name or IP address to ping.

        Returns:
            HealthStatus.DOWN if the ping result carries an error,
            HealthStatus.UP if it reports an average response time, and
            HealthStatus.UNKNOWN otherwise (including when no ping result
            was returned at all).
        """
        pings = self.ping(host=host, num_packets=4)
        # Guard against an empty result list instead of failing with
        # IndexError on pings[0].
        if not pings:
            return HealthStatus.UNKNOWN

        result = pings[0]
        if result.error:
            return HealthStatus.DOWN
        if result.avg_response_ms:
            return HealthStatus.UP
        return HealthStatus.UNKNOWN

    @staticmethod
    def _count_service_errors(lines, service_names):
        """Count 'ERROR' log lines per service since its last start.

        Args:
            lines: Iterable of log lines (str).
            service_names: List of service names to look for.

        Returns:
            A dictionary mapping each service name to the number of lines
            that mention the service and contain 'ERROR', counted from the
            most recent 'Starting <service>...' line onwards.
        """
        counts = {service_name: 0 for service_name in service_names}
        for line in lines:
            for service_name in service_names:
                if service_name not in line:
                    continue
                # Reset the counter for restart/start
                if 'Starting {}...'.format(service_name) in line:
                    counts[service_name] = 0
                elif 'ERROR' in line:
                    counts[service_name] += 1
        return counts

    def get_error_summary(self, service_names):
        """Get the list of services with the error count.

        Args:
            service_names: List of service names.

        Returns:
            A dictionary with service name as a key and the Errors object
            as a value.

        Raises:
            PermissionError: User has no permission to execute the command
        """
        configs = {
            service_name: load_service_mconfig_as_json(service_name)
            for service_name in service_names
        }

        syslog_path = '/var/log/syslog'
        if not os.access(syslog_path, os.R_OK):
            raise PermissionError(
                'syslog is not readable. '
                'Try `sudo chmod a+r {}`. '
                'Or execute the command with sudo '
                'permissions: `venvsudo`'.format(syslog_path),
            )
        # errors='replace' keeps a stray non-UTF-8 byte in syslog from
        # aborting the whole summary with UnicodeDecodeError.
        with open(
            syslog_path, 'r', encoding='utf-8', errors='replace',
        ) as f:
            error_counts = self._count_service_errors(f, service_names)

        return {
            service_name: Errors(
                log_level=configs[service_name].get('logLevel', 'INFO'),
                error_count=error_counts[service_name],
            )
            for service_name in service_names
        }

    @staticmethod
    def _get_process_elapsed_time(service_name, pid):
        """Return the `ps` elapsed-time string for a process.

        Args:
            service_name: Service name, used only in the error message.
            pid: Process id to query.

        Returns:
            The stripped output of `ps -o etime= -p <pid>`.

        Raises:
            ValueError: `ps` wrote something to stderr.
        """
        process = subprocess.Popen(
            ['ps', '-o', 'etime=', '-p', str(pid)],
            stdout=subprocess.PIPE,
            # stderr must be piped; otherwise communicate() always returns
            # None for it and the check below can never fire.
            stderr=subprocess.PIPE,
        )
        time_running, error = process.communicate()
        if error:
            raise ValueError(
                'Cannot get time running for the service '
                '{} `ps -o etime= -p {}`'
                .format(service_name, pid),
            )
        return str(time_running, 'utf-8').strip()

    def get_magma_services_summary(self):
        """Get health for all the services known to magmad.

        Service names are taken from the keys of magmad's GetConfigs
        response; each one is looked up as the systemd unit
        `magma@<name>.service`.

        Returns:
            A list of ServiceHealth objects, one per service.

        Raises:
            PermissionError: syslog is not readable (see
                get_error_summary).
            ValueError: The running time of an active service could not
                be determined.
        """
        services_health_summary = []

        # DBus objects: https://www.freedesktop.org/wiki/Software/systemd/dbus/
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        configs = client.GetConfigs(common_pb2.Void())

        service_names = [str(name) for name in configs.configs_by_key]
        services_errors = self.get_error_summary(service_names=service_names)

        for service_name in service_names:
            unit = Unit(
                'magma@{}.service'.format(service_name),
                _autoload=True,
            )
            active_state = ActiveState.dbus2state[unit.Unit.ActiveState]
            sub_state = str(unit.Unit.SubState, 'utf-8')
            if active_state == ActiveState.ACTIVE:
                time_running = self._get_process_elapsed_time(
                    service_name, unit.Service.MainPID,
                )
            else:
                # Placeholder reported for services that are not active.
                time_running = '00'

            services_health_summary.append(
                ServiceHealth(
                    service_name=service_name,
                    active_state=active_state, sub_state=sub_state,
                    time_running=time_running,
                    errors=services_errors[service_name],
                ),
            )
        return services_health_summary

    def get_unexpected_restart_summary(self):
        """Get the unexpected restart count for each polled service.

        Creates a magmad MagmaService and a ServicePoller on its event
        loop, refreshes the poller's service info once, and reads the
        UNEXPECTED_SERVICE_RESTARTS metric for every service it reports.

        Returns:
            A dictionary with service name as a key and a
            RestartFrequency object (with an empty time interval) as a
            value.
        """
        service = MagmaService('magmad', mconfigs_pb2.MagmaD())
        service_poller = ServicePoller(service.loop, service.config)
        service_poller.start()

        asyncio.set_event_loop(service.loop)

        # noinspection PyProtectedMember
        # pylint: disable=protected-access
        async def fetch_info():
            restart_frequencies = {}
            await service_poller._get_service_info()
            for service_name in service_poller.service_info.keys():
                # NOTE(review): reads the metric through its private
                # `_value` attribute; presumably there is no public
                # accessor -- confirm against the metrics library.
                restarts = int(
                    UNEXPECTED_SERVICE_RESTARTS
                    .labels(service_name=service_name)
                    ._value.get(),
                )
                restart_frequencies[service_name] = RestartFrequency(
                    count=restarts,
                    time_interval='',
                )

            return restart_frequencies

        return service.loop.run_until_complete(fetch_info())

    def get_kernel_version(self):
        """Return the output of `uname -a` as a string.

        Raises:
            ValueError: `uname` wrote to stderr or exited with a non-zero
                status.
        """
        process = subprocess.Popen(
            ['uname', '-a'],
            stdout=subprocess.PIPE,
            # Without piping stderr, communicate() returns None for it and
            # a failure would go unnoticed.
            stderr=subprocess.PIPE,
        )
        info, error = process.communicate()

        if error or process.returncode != 0:
            raise ValueError('Cannot get the kernel version')
        return str(info, 'utf-8')

    def get_magma_version(self):
        """Return the installed magma package version from the apt cache.

        Returns:
            A Version object. When the `magma` package is not in the apt
            cache, the python3 package version is reported instead with
            '-' as the update time. Otherwise the version string is split
            on '-' into a version code and a Unix timestamp, and the
            timestamp is formatted in the local time zone.
        """
        cache = apt.Cache()

        # Return the python version if magma is not there
        if 'magma' not in cache:
            return Version(
                # str() so version_code is a string in both branches.
                version_code=str(cache['python3'].versions[0]),
                last_update_time='-',
            )

        pkg = str(cache['magma'].versions[0])
        version = pkg.split('-')[0].split('=')[-1]
        timestamp = int(pkg.split('-')[1])

        # fromtimestamp(..., tz=UTC) yields the same aware datetime as the
        # deprecated utcfromtimestamp(...).replace(tzinfo=UTC).
        last_update = datetime.fromtimestamp(timestamp, tz=tz.tzutc())
        return Version(
            version_code=version,
            last_update_time=last_update
            .astimezone(tz=tz.tzlocal())
            .strftime('%Y-%m-%d %H:%M:%S'),
        )

    def get_health_summary(self):
        """Collect every health indicator into a single HealthSummary.

        Internet health is the ping status of 8.8.8.8 and DNS health is
        the ping status of google.com.

        Returns:
            A HealthSummary object.
        """
        return HealthSummary(
            version=self.get_magma_version(),
            platform=self.get_kernel_version(),
            services_health=self.get_magma_services_summary(),
            internet_health=self.ping_status(host='8.8.8.8'),
            dns_health=self.ping_status(host='google.com'),
            unexpected_restarts=self.get_unexpected_restart_summary(),
        )