Wei-Yu Chen | 49950b9 | 2021-11-08 19:19:18 +0800 | [diff] [blame^] | 1 | #!/usr/bin/env python3 |
| 2 | |
| 3 | """ |
| 4 | Copyright 2020 The Magma Authors. |
| 5 | |
| 6 | This source code is licensed under the BSD-style license found in the |
| 7 | LICENSE file in the root directory of this source tree. |
| 8 | |
| 9 | Unless required by applicable law or agreed to in writing, software |
| 10 | distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | See the License for the specific language governing permissions and |
| 13 | limitations under the License. |
| 14 | """ |
| 15 | |
| 16 | import asyncio |
| 17 | import os |
| 18 | import subprocess |
| 19 | from datetime import datetime |
| 20 | |
| 21 | import apt |
| 22 | from dateutil import tz |
| 23 | from common.health.entities import ( |
| 24 | ActiveState, |
| 25 | Errors, |
| 26 | HealthStatus, |
| 27 | HealthSummary, |
| 28 | RestartFrequency, |
| 29 | ServiceHealth, |
| 30 | Version, |
| 31 | ) |
| 32 | from common.service import MagmaService |
| 33 | from common.service_registry import ServiceRegistry |
| 34 | from configuration.mconfig_managers import load_service_mconfig_as_json |
| 35 | from magmad.metrics import UNEXPECTED_SERVICE_RESTARTS |
| 36 | from magmad.service_poller import ServicePoller |
| 37 | from orc8r.protos import common_pb2, magmad_pb2 |
| 38 | from orc8r.protos.magmad_pb2_grpc import MagmadStub |
| 39 | from orc8r.protos.mconfig import mconfigs_pb2 |
| 40 | from pystemd.systemd1 import Unit |
| 41 | |
| 42 | |
class GenericHealthChecker:
    """Collect health information about the local gateway.

    The checker aggregates the installed magma package version, the kernel
    version, the systemd state of every magma service, basic connectivity
    (ping) results and the unexpected-restart counters into a single
    ``HealthSummary``.
    """

    def ping(self, host, num_packets=4):
        """Ask the local magmad service to ping ``host``.

        Args:
            host: Hostname or IP address to ping.
            num_packets: Number of packets requested for the ping.

        Returns:
            The ``pings`` field of the ``RunNetworkTests`` response.
        """
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        request = magmad_pb2.NetworkTestRequest(
            pings=[
                magmad_pb2.PingParams(
                    host_or_ip=host,
                    num_packets=num_packets,
                ),
            ],
        )
        response = client.RunNetworkTests(request)
        return response.pings

    def ping_status(self, host):
        """Translate the ping result for ``host`` into a ``HealthStatus``.

        Args:
            host: Hostname or IP address to ping.

        Returns:
            ``HealthStatus.DOWN`` when the ping result carries an error,
            ``HealthStatus.UP`` when it reports a non-zero average response
            time, and ``HealthStatus.UNKNOWN`` otherwise (including when no
            ping result was returned at all).
        """
        results = self.ping(host=host, num_packets=4)
        # Guard against an empty result list instead of failing with an
        # IndexError on results[0].
        if len(results) == 0:
            return HealthStatus.UNKNOWN

        result = results[0]
        if result.error:
            return HealthStatus.DOWN
        if result.avg_response_ms:
            return HealthStatus.UP
        return HealthStatus.UNKNOWN

    @staticmethod
    def _count_service_errors(lines, service_names):
        """Count 'ERROR' log lines per service since its last start marker.

        Args:
            lines: Iterable of log lines (strings).
            service_names: List of service names to look for.

        Returns:
            A dictionary mapping each service name to the number of lines
            that mention the service and contain 'ERROR', counted since the
            last line containing 'Starting <service_name>...'.
        """
        counts = {name: 0 for name in service_names}
        # The start markers do not depend on the line, so build them once.
        start_markers = {
            name: 'Starting {}...'.format(name)
            for name in service_names
        }
        for line in lines:
            for name in service_names:
                if name not in line:
                    continue
                # Reset the counter for restart/start
                if start_markers[name] in line:
                    counts[name] = 0
                elif 'ERROR' in line:
                    counts[name] += 1
        return counts

    def get_error_summary(self, service_names):
        """Get the list of services with the error count.

        Args:
            service_names: List of service names.

        Returns:
            A dictionary with service name as a key and the Errors object
            as a value.

        Raises:
            PermissionError: User has no permission to read the syslog file.
        """
        configs = {
            service_name: load_service_mconfig_as_json(service_name)
            for service_name in service_names
        }
        res = {
            service_name: Errors(
                log_level=configs[service_name].get('logLevel', 'INFO'),
                error_count=0,
            )
            for service_name in service_names
        }

        syslog_path = '/var/log/syslog'
        if not os.access(syslog_path, os.R_OK):
            raise PermissionError(
                'syslog is not readable. '
                'Try `sudo chmod a+r {}`. '
                'Or execute the command with sudo '
                'permissions: `venvsudo`'.format(syslog_path),
            )
        # errors='replace': a single malformed byte in syslog must not abort
        # the whole summary with a UnicodeDecodeError.
        with open(
            syslog_path, 'r', encoding='utf-8', errors='replace',
        ) as f:
            error_counts = self._count_service_errors(f, service_names)

        for service_name, error_count in error_counts.items():
            res[service_name].error_count = error_count
        return res

    def get_magma_services_summary(self):
        """Get health for all the services configured in magmad.

        Returns:
            A list with one ``ServiceHealth`` entry per service name found
            in the configs returned by magmad.

        Raises:
            ValueError: ``ps`` reported an error while reading the elapsed
                running time of an active service.
            PermissionError: Propagated from ``get_error_summary`` when the
                syslog file is not readable.
        """
        services_health_summary = []

        # DBus objects: https://www.freedesktop.org/wiki/Software/systemd/dbus/
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        configs = client.GetConfigs(common_pb2.Void())

        service_names = [str(name) for name in configs.configs_by_key]
        services_errors = self.get_error_summary(service_names=service_names)

        for service_name in service_names:
            unit = Unit(
                'magma@{}.service'.format(service_name),
                _autoload=True,
            )
            active_state = ActiveState.dbus2state[unit.Unit.ActiveState]
            sub_state = str(unit.Unit.SubState, 'utf-8')
            if active_state == ActiveState.ACTIVE:
                pid = unit.Service.MainPID
                # stderr must be captured explicitly; without it the error
                # output is never available and the check below is dead code.
                completed = subprocess.run(
                    ['ps', '-o', 'etime=', '-p', str(pid)],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    check=False,
                )
                if completed.stderr:
                    raise ValueError(
                        'Cannot get time running for the service '
                        '{} `ps -o etime= -p {}`'
                        .format(service_name, pid),
                    )
                time_running = completed.stdout
            else:
                time_running = b'00'

            services_health_summary.append(
                ServiceHealth(
                    service_name=service_name,
                    active_state=active_state,
                    sub_state=sub_state,
                    time_running=str(time_running, 'utf-8').strip(),
                    errors=services_errors[service_name],
                ),
            )
        return services_health_summary

    def get_unexpected_restart_summary(self):
        """Get the unexpected-restart counter of every polled service.

        Returns:
            A dictionary with service name as a key and a
            ``RestartFrequency`` (with an empty ``time_interval``) as a value.
        """
        service = MagmaService('magmad', mconfigs_pb2.MagmaD())
        service_poller = ServicePoller(service.loop, service.config)
        service_poller.start()

        asyncio.set_event_loop(service.loop)

        # noinspection PyProtectedMember
        # pylint: disable=protected-access
        async def fetch_info():
            restart_frequencies = {}
            await service_poller._get_service_info()
            for service_name in service_poller.service_info:
                restarts = int(
                    UNEXPECTED_SERVICE_RESTARTS
                    .labels(service_name=service_name)
                    ._value.get(),
                )
                restart_frequencies[service_name] = RestartFrequency(
                    count=restarts,
                    time_interval='',
                )

            return restart_frequencies

        return service.loop.run_until_complete(fetch_info())

    def get_kernel_version(self):
        """Return the output of ``uname -a`` as a string.

        Raises:
            ValueError: ``uname`` exited with a non-zero status or wrote to
                stderr.
        """
        completed = subprocess.run(
            ['uname', '-a'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )

        if completed.returncode != 0 or completed.stderr:
            raise ValueError('Cannot get the kernel version')
        return str(completed.stdout, 'utf-8')

    def get_magma_version(self):
        """Get the magma package version known to apt.

        Returns:
            A ``Version`` built from the first entry of the 'magma' package
            versions: the version code is the text before the first '-'
            (after any '='), and the update time is the integer following
            that '-' interpreted as a UTC timestamp and rendered in local
            time. When apt does not know a 'magma' package, the first
            'python3' package version is returned with '-' as update time.
        """
        cache = apt.Cache()

        # Return the python version if magma is not there
        if 'magma' not in cache:
            return Version(
                version_code=cache['python3'].versions[0],
                last_update_time='-',
            )

        pkg = str(cache['magma'].versions[0])
        version = pkg.split('-')[0].split('=')[-1]
        timestamp = int(pkg.split('-')[1])

        # Timezone-aware equivalent of the deprecated utcfromtimestamp().
        updated_at = (
            datetime.fromtimestamp(timestamp, tz=tz.tzutc())
            .astimezone(tz=tz.tzlocal())
        )
        return Version(
            version_code=version,
            last_update_time=updated_at.strftime('%Y-%m-%d %H:%M:%S'),
        )

    def get_health_summary(self):
        """Assemble the complete ``HealthSummary`` of the gateway.

        Internet health is derived from pinging 8.8.8.8 and DNS health from
        pinging google.com.
        """
        return HealthSummary(
            version=self.get_magma_version(),
            platform=self.get_kernel_version(),
            services_health=self.get_magma_services_summary(),
            internet_health=self.ping_status(host='8.8.8.8'),
            dns_health=self.ping_status(host='google.com'),
            unexpected_restarts=self.get_unexpected_restart_summary(),
        )