#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2020 The Magma Authors.
# SPDX-FileCopyrightText: 2022 Open Networking Foundation <support@opennetworking.org>
#
# SPDX-License-Identifier: BSD-3-Clause
import asyncio
import os
import subprocess
from datetime import datetime
import apt
from dateutil import tz
from common.health.entities import (
ActiveState,
Errors,
HealthStatus,
HealthSummary,
RestartFrequency,
ServiceHealth,
Version,
)
from common.service import MagmaService
from common.service_registry import ServiceRegistry
from configuration.mconfig_managers import load_service_mconfig_as_json
from magmad.metrics import UNEXPECTED_SERVICE_RESTARTS
from magmad.service_poller import ServicePoller
from orc8r.protos import common_pb2, magmad_pb2
from orc8r.protos.magmad_pb2_grpc import MagmadStub
from orc8r.protos.mconfig import mconfigs_pb2
from pystemd.systemd1 import Unit
class GenericHealthChecker:
    """Gather health information about a gateway.

    The checker combines data from the local magmad gRPC service, systemd
    (through pystemd), syslog, the apt cache and a few shell utilities into
    the entities declared in ``common.health.entities``.
    """

    @staticmethod
    def _run_command(command, error_message):
        """Run a command and return its decoded standard output.

        Args:
            command: The command and its arguments as a list of strings.
            error_message: Message of the ValueError raised on failure.

        Returns:
            The standard output of the command decoded as UTF-8.

        Raises:
            ValueError: The command exited with a non-zero return code.
        """
        # stderr is captured so a failing command does not write to the
        # caller's terminal; the exit status is what signals the failure.
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        if completed.returncode != 0:
            raise ValueError(error_message)
        return str(completed.stdout, 'utf-8')

    def ping(self, host, num_packets=4):
        """Ping a host through the RunNetworkTests RPC of the local magmad.

        Args:
            host: Host name or IP address to ping.
            num_packets: Number of packets to request for the ping.

        Returns:
            The ``pings`` field of the network test response.
        """
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)
        response = client.RunNetworkTests(
            magmad_pb2.NetworkTestRequest(
                pings=[
                    magmad_pb2.PingParams(
                        host_or_ip=host,
                        num_packets=num_packets,
                    ),
                ],
            ),
        )
        return response.pings

    def ping_status(self, host):
        """Translate the ping result for a host into a HealthStatus.

        Args:
            host: Host name or IP address to ping.

        Returns:
            HealthStatus.DOWN if the ping result carries an error,
            HealthStatus.UP if it carries an average response time,
            HealthStatus.UNKNOWN otherwise (including when no ping result
            was returned at all).
        """
        results = self.ping(host=host, num_packets=4)
        if not results:
            # Nothing to inspect; avoid an IndexError on an empty result.
            return HealthStatus.UNKNOWN
        result = results[0]
        if result.error:
            return HealthStatus.DOWN
        if result.avg_response_ms:
            return HealthStatus.UP
        return HealthStatus.UNKNOWN

    def get_error_summary(self, service_names):
        """Get the list of services with the error count.

        A syslog line is attributed to a service when it contains the service
        name. A line containing ``Starting <service>...`` resets the count of
        that service, so only errors since the last (re)start are counted.

        Args:
            service_names: List of service names.

        Returns:
            A dictionary with service name as a key and the Errors object
            as a value.

        Raises:
            PermissionError: User has no permission to execute the command
        """
        res = {
            service_name: Errors(
                log_level=load_service_mconfig_as_json(service_name).get(
                    'logLevel', 'INFO',
                ),
                error_count=0,
            )
            for service_name in service_names
        }

        syslog_path = '/var/log/syslog'
        if not os.access(syslog_path, os.R_OK):
            raise PermissionError(
                'syslog is not readable. '
                'Try `sudo chmod a+r {}`. '
                'Or execute the command with sudo '
                'permissions: `venvsudo`'.format(syslog_path),
            )

        # The start marker is loop invariant; build it once per service
        # instead of once per (line, service) pair.
        start_markers = {
            service_name: 'Starting {}...'.format(service_name)
            for service_name in service_names
        }

        # errors='replace' keeps the scan going when syslog contains bytes
        # that are not valid UTF-8 instead of aborting the whole summary.
        with open(
            syslog_path, 'r', encoding='utf-8', errors='replace',
        ) as f:
            for line in f:
                for service_name in service_names:
                    if service_name not in line:
                        continue
                    # Reset the counter for restart/start
                    if start_markers[service_name] in line:
                        res[service_name].error_count = 0
                    elif 'ERROR' in line:
                        res[service_name].error_count += 1
        return res

    def get_magma_services_summary(self):
        """Get health for all the running services.

        The service names are the keys of the configs returned by the local
        magmad; the state of each one is read from its
        ``magma@<name>.service`` systemd unit.

        Returns:
            A list with one ServiceHealth per service.

        Raises:
            ValueError: The running time of an active service could not be
                obtained from ``ps``.
            PermissionError: syslog is not readable (see get_error_summary).
        """
        services_health_summary = []

        # DBus objects: https://www.freedesktop.org/wiki/Software/systemd/dbus/
        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
        client = MagmadStub(chan)

        configs = client.GetConfigs(common_pb2.Void())
        service_names = [str(name) for name in configs.configs_by_key]
        services_errors = self.get_error_summary(service_names=service_names)

        for service_name in service_names:
            unit = Unit(
                'magma@{}.service'.format(service_name),
                _autoload=True,
            )
            active_state = ActiveState.dbus2state[unit.Unit.ActiveState]
            sub_state = str(unit.Unit.SubState, 'utf-8')

            if active_state == ActiveState.ACTIVE:
                pid = unit.Service.MainPID
                # `ps -o etime=` prints the elapsed time since the process
                # was started, without a header line.
                time_running = self._run_command(
                    'ps -o etime= -p {}'.format(pid).split(),
                    'Cannot get time running for the service '
                    '{} `ps -o etime= -p {}`'
                    .format(service_name, pid),
                )
            else:
                time_running = '00'

            services_health_summary.append(
                ServiceHealth(
                    service_name=service_name,
                    active_state=active_state, sub_state=sub_state,
                    time_running=time_running.strip(),
                    errors=services_errors[service_name],
                ),
            )
        return services_health_summary

    def get_unexpected_restart_summary(self):
        """Get the unexpected restart count of every polled service.

        Returns:
            A dictionary with service name as a key and a RestartFrequency
            (with an empty ``time_interval``) as a value.
        """
        service = MagmaService('magmad', mconfigs_pb2.MagmaD())
        service_poller = ServicePoller(service.loop, service.config)
        service_poller.start()

        asyncio.set_event_loop(service.loop)

        # noinspection PyProtectedMember
        # pylint: disable=protected-access
        async def fetch_info():
            restart_frequencies = {}
            await service_poller._get_service_info()
            for service_name in service_poller.service_info.keys():
                # The counter value is read from the metric's internals.
                restarts = int(
                    UNEXPECTED_SERVICE_RESTARTS
                    .labels(service_name=service_name)
                    ._value.get(),
                )
                restart_frequencies[service_name] = RestartFrequency(
                    count=restarts,
                    time_interval='',
                )
            return restart_frequencies

        return service.loop.run_until_complete(fetch_info())

    def get_kernel_version(self):
        """Get the output of ``uname -a``.

        Returns:
            The command output as a string, exactly as printed (including
            the trailing newline).

        Raises:
            ValueError: ``uname`` exited with a non-zero return code.
        """
        return self._run_command(
            'uname -a'.split(),
            'Cannot get the kernel version',
        )

    def get_magma_version(self):
        """Get the version of the installed magma package from the apt cache.

        Returns:
            A Version with the magma version and its build time formatted in
            the local timezone. If the version string has no numeric
            timestamp field the update time is '-'. If magma is not in the
            apt cache, the python3 version is reported with '-' as the
            update time.
        """
        cache = apt.Cache()

        # Return the python version if magma is not there
        if 'magma' not in cache:
            return Version(
                # str() keeps version_code a string in both branches.
                version_code=str(cache['python3'].versions[0]),
                last_update_time='-',
            )

        # The package version is expected to look like
        # `<version>-<unix timestamp>-...`, optionally prefixed by `<name>=`.
        fields = str(cache['magma'].versions[0]).split('-')
        version = fields[0].split('=')[-1]
        try:
            timestamp = int(fields[1])
        except (IndexError, ValueError):
            # No usable timestamp in the version string.
            return Version(version_code=version, last_update_time='-')

        return Version(
            version_code=version,
            last_update_time=datetime.fromtimestamp(
                timestamp, tz=tz.tzlocal(),
            ).strftime('%Y-%m-%d %H:%M:%S'),
        )

    def get_health_summary(self):
        """Build the complete health summary.

        Returns:
            A HealthSummary with the magma version, the kernel information,
            the health of each service, the ping status of 8.8.8.8 and
            google.com, and the unexpected restart counts.
        """
        return HealthSummary(
            version=self.get_magma_version(),
            platform=self.get_kernel_version(),
            services_health=self.get_magma_services_summary(),
            internet_health=self.ping_status(host='8.8.8.8'),
            dns_health=self.ping_status(host='google.com'),
            unexpected_restarts=self.get_unexpected_restart_summary(),
        )