Wei-Yu Chen | ad55cb8 | 2022-02-15 20:07:01 +0800 | [diff] [blame] | 1 | # !/usr/bin/env python3 |
Wei-Yu Chen | 49950b9 | 2021-11-08 19:19:18 +0800 | [diff] [blame] | 2 | |
Wei-Yu Chen | ad55cb8 | 2022-02-15 20:07:01 +0800 | [diff] [blame] | 3 | # SPDX-FileCopyrightText: 2020 The Magma Authors. |
| 4 | # SPDX-FileCopyrightText: 2022 Open Networking Foundation <support@opennetworking.org> |
| 5 | # |
| 6 | # SPDX-License-Identifier: BSD-3-Clause |
Wei-Yu Chen | 49950b9 | 2021-11-08 19:19:18 +0800 | [diff] [blame] | 7 | |
| 8 | import asyncio |
| 9 | import os |
| 10 | import subprocess |
| 11 | from datetime import datetime |
| 12 | |
| 13 | import apt |
| 14 | from dateutil import tz |
| 15 | from common.health.entities import ( |
| 16 | ActiveState, |
| 17 | Errors, |
| 18 | HealthStatus, |
| 19 | HealthSummary, |
| 20 | RestartFrequency, |
| 21 | ServiceHealth, |
| 22 | Version, |
| 23 | ) |
| 24 | from common.service import MagmaService |
| 25 | from common.service_registry import ServiceRegistry |
| 26 | from configuration.mconfig_managers import load_service_mconfig_as_json |
| 27 | from magmad.metrics import UNEXPECTED_SERVICE_RESTARTS |
| 28 | from magmad.service_poller import ServicePoller |
| 29 | from orc8r.protos import common_pb2, magmad_pb2 |
| 30 | from orc8r.protos.magmad_pb2_grpc import MagmadStub |
| 31 | from orc8r.protos.mconfig import mconfigs_pb2 |
| 32 | from pystemd.systemd1 import Unit |
| 33 | |
| 34 | |
class GenericHealthChecker:
    """Collect health information about the gateway.

    The checker gathers the installed package version, the kernel
    description, the state of every magma systemd unit, basic network
    reachability and the unexpected-restart counters, and bundles them
    into a ``HealthSummary``.
    """

    @staticmethod
    def _run_command(args, failure_message):
        """Run a command and return the raw bytes it wrote to stdout.

        Args:
            args: Command and arguments as a list of strings.
            failure_message: Message of the ValueError raised on failure.

        Returns:
            The bytes the command wrote to stdout.

        Raises:
            ValueError: The command wrote something to stderr.
        """
        # stderr must be captured explicitly: without it the error output
        # is never returned to the caller and failures go unnoticed.
        completed = subprocess.run(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        if completed.stderr:
            raise ValueError(failure_message)
        return completed.stdout

    def ping(self, host, num_packets=4):
        """Ask the local magmad service to ping a host.

        Args:
            host: Host name or IP address to ping.
            num_packets: Number of packets to send.

        Returns:
            The ``pings`` field of the RunNetworkTests response.
        """
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        response = client.RunNetworkTests(
            magmad_pb2.NetworkTestRequest(
                pings=[
                    magmad_pb2.PingParams(
                        host_or_ip=host,
                        num_packets=num_packets,
                    ),
                ],
            ),
        )
        return response.pings

    def ping_status(self, host):
        """Translate the ping result for a host into a HealthStatus.

        Args:
            host: Host name or IP address to ping.

        Returns:
            HealthStatus.DOWN if the ping reported an error,
            HealthStatus.UP if an average response time was reported,
            HealthStatus.UNKNOWN otherwise (including an empty response).
        """
        results = self.ping(host=host, num_packets=4)
        # Only one host is requested, so only the first entry matters;
        # an empty response carries no information at all.
        if not results:
            return HealthStatus.UNKNOWN

        result = results[0]
        if result.error:
            return HealthStatus.DOWN
        if result.avg_response_ms:
            return HealthStatus.UP
        return HealthStatus.UNKNOWN

    def get_error_summary(self, service_names):
        """Get the list of services with the error count.

        The syslog is scanned once. For every line mentioning a service,
        a 'Starting <service>...' entry resets that service's counter and
        any other line containing 'ERROR' increments it.

        Args:
            service_names: List of service names.

        Returns:
            A dictionary with service name as a key and the Errors object
            as a value.

        Raises:
            PermissionError: User has no permission to read the syslog.
        """
        res = {
            service_name: Errors(
                log_level=load_service_mconfig_as_json(service_name).get(
                    'logLevel', 'INFO',
                ),
                error_count=0,
            )
            for service_name in service_names
        }

        syslog_path = '/var/log/syslog'
        if not os.access(syslog_path, os.R_OK):
            raise PermissionError(
                'syslog is not readable. '
                'Try `sudo chmod a+r {}`. '
                'Or execute the command with sudo '
                'permissions: `venvsudo`'.format(syslog_path),
            )

        # Build the start markers once instead of once per syslog line.
        start_markers = {
            service_name: 'Starting {}...'.format(service_name)
            for service_name in service_names
        }

        # errors='replace' keeps the scan going if the log contains bytes
        # that are not valid UTF-8; only ASCII markers are searched for.
        with open(
            syslog_path, 'r', encoding='utf-8', errors='replace',
        ) as f:
            for line in f:
                for service_name in service_names:
                    if service_name not in line:
                        continue
                    # Reset the counter for restart/start
                    if start_markers[service_name] in line:
                        res[service_name].error_count = 0
                    elif 'ERROR' in line:
                        res[service_name].error_count += 1
        return res

    def get_magma_services_summary(self):
        """Get health for all the services known to magmad.

        The service names are the keys of the configs returned by the
        local magmad service; each one is looked up as the systemd unit
        'magma@<name>.service'.

        Returns:
            A list with one ServiceHealth entry per service.

        Raises:
            ValueError: `ps` reported an error while reading the running
                time of an active service.
            PermissionError: The syslog is not readable.
        """
        services_health_summary = []

        # DBus objects: https://www.freedesktop.org/wiki/Software/systemd/dbus/
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        configs = client.GetConfigs(common_pb2.Void())

        service_names = [str(name) for name in configs.configs_by_key]
        services_errors = self.get_error_summary(service_names=service_names)

        for service_name in service_names:
            unit = Unit(
                'magma@{}.service'.format(service_name),
                _autoload=True,
            )
            active_state = ActiveState.dbus2state[unit.Unit.ActiveState]
            sub_state = str(unit.Unit.SubState, 'utf-8')
            if active_state == ActiveState.ACTIVE:
                pid = unit.Service.MainPID
                # `etime=` prints the elapsed time with no header line.
                time_running = self._run_command(
                    ['ps', '-o', 'etime=', '-p', str(pid)],
                    'Cannot get time running for the service '
                    '{} `ps -o etime= -p {}`'
                    .format(service_name, pid),
                )
            else:
                # Placeholder running time for services that are not active.
                time_running = b'00'

            services_health_summary.append(
                ServiceHealth(
                    service_name=service_name,
                    active_state=active_state, sub_state=sub_state,
                    time_running=str(time_running, 'utf-8').strip(),
                    errors=services_errors[service_name],
                ),
            )
        return services_health_summary

    def get_unexpected_restart_summary(self):
        """Get the unexpected restart counter of every polled service.

        A MagmaService and a ServicePoller are created for 'magmad', the
        poller refreshes its service info once, and the current value of
        the UNEXPECTED_SERVICE_RESTARTS metric is read for each service.

        Returns:
            A dictionary with service name as a key and a RestartFrequency
            (with an empty time interval) as a value.
        """
        service = MagmaService('magmad', mconfigs_pb2.MagmaD())
        service_poller = ServicePoller(service.loop, service.config)
        service_poller.start()

        asyncio.set_event_loop(service.loop)

        # noinspection PyProtectedMember
        # pylint: disable=protected-access
        async def fetch_info():
            restart_frequencies = {}
            await service_poller._get_service_info()
            for service_name in service_poller.service_info:
                restarts = int(
                    UNEXPECTED_SERVICE_RESTARTS
                    .labels(service_name=service_name)
                    ._value.get(),
                )
                restart_frequencies[service_name] = RestartFrequency(
                    count=restarts,
                    time_interval='',
                )

            return restart_frequencies

        return service.loop.run_until_complete(fetch_info())

    def get_kernel_version(self):
        """Get the kernel description as printed by `uname -a`.

        Returns:
            The output of `uname -a` decoded as UTF-8, including the
            trailing newline.

        Raises:
            ValueError: `uname` reported an error.
        """
        info = self._run_command(
            ['uname', '-a'],
            'Cannot get the kernel version',
        )
        return str(info, 'utf-8')

    def get_magma_version(self):
        """Get the version of the installed magma package.

        Returns:
            A Version with the version code and the last update time
            formatted as '%Y-%m-%d %H:%M:%S' in the local time zone. If
            the apt cache has no 'magma' package, the first python3
            package version is returned with '-' as the update time.
        """
        cache = apt.Cache()

        # Return the python version if magma is not there
        if 'magma' not in cache:
            # NOTE(review): this passes the apt version object itself, while
            # the magma branch below passes a string - confirm that Version
            # accepts both.
            return Version(
                version_code=cache['python3'].versions[0],
                last_update_time='-',
            )

        # Presumably the package string looks like
        # '<name>=<version>-<unix timestamp>-...' - verify against the
        # packaging scripts.
        pkg = str(cache['magma'].versions[0])
        fields = pkg.split('-')
        version = fields[0].split('=')[-1]
        timestamp = int(fields[1])

        # Build an aware UTC datetime directly, then convert to local time.
        updated_at = datetime.fromtimestamp(
            timestamp, tz=tz.tzutc(),
        ).astimezone(tz=tz.tzlocal())

        return Version(
            version_code=version,
            last_update_time=updated_at.strftime('%Y-%m-%d %H:%M:%S'),
        )

    def get_health_summary(self):
        """Collect every health check into a single HealthSummary.

        Returns:
            A HealthSummary with the package version, the kernel
            description, the per-service health, the reachability of
            8.8.8.8 and google.com, and the unexpected restart counters.
        """
        return HealthSummary(
            version=self.get_magma_version(),
            platform=self.get_kernel_version(),
            services_health=self.get_magma_services_summary(),
            internet_health=self.ping_status(host='8.8.8.8'),
            dns_health=self.ping_status(host='google.com'),
            unexpected_restarts=self.get_unexpected_restart_summary(),
        )