Init commit for standalone enodebd

Change-Id: I88eeef5135dd7ba8551ddd9fb6a0695f5325337b
diff --git a/common/health/__init__.py b/common/health/__init__.py
new file mode 100644
index 0000000..5c6cb64
--- /dev/null
+++ b/common/health/__init__.py
@@ -0,0 +1,12 @@
+"""
+Copyright 2020 The Magma Authors.
+
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
diff --git a/common/health/docker_health_service.py b/common/health/docker_health_service.py
new file mode 100644
index 0000000..70a728a
--- /dev/null
+++ b/common/health/docker_health_service.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+"""
+Copyright 2020 The Magma Authors.
+
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from datetime import datetime
+
+import dateutil.parser
+import docker
+from common.health.entities import Errors, ServiceHealth, Version
+from common.health.health_service import GenericHealthChecker
+
+
+class DockerHealthChecker(GenericHealthChecker):
+
+    def get_error_summary(self, service_names):
+        res = {}
+        for service_name in service_names:
+            client = docker.from_env()
+            container = client.containers.get(service_name)
+
+            res[service_name] = Errors(log_level='-', error_count=0)
+            for line in container.logs().decode('utf-8').split('\n'):
+                if service_name not in line:
+                    continue
+                # Reset the counter for restart/start
+                if 'Starting {}...'.format(service_name) in line:
+                    res[service_name].error_count = 0
+                elif 'ERROR' in line:
+                    res[service_name].error_count += 1
+        return res
+
+    def get_magma_services_summary(self):
+        services_health_summary = []
+        client = docker.from_env()
+
+        for container in client.containers.list():
+            service_start_time = dateutil.parser.parse(
+                container.attrs['State']['StartedAt'],
+            )
+            current_time = datetime.now(service_start_time.tzinfo)
+            time_running = current_time - service_start_time
+            services_health_summary.append(
+                ServiceHealth(
+                    service_name=container.name,
+                    active_state=container.status,
+                    sub_state=container.status,
+                    time_running=str(time_running).split('.', 1)[0],
+                    errors=self.get_error_summary([container.name])[
+                        container.name
+                    ],
+                ),
+            )
+        return services_health_summary
+
+    def get_magma_version(self):
+        client = docker.from_env()
+        container = client.containers.get('magmad')
+
+        return Version(
+            version_code=container.attrs['Config']['Image'],
+            last_update_time='-',
+        )
diff --git a/common/health/entities.py b/common/health/entities.py
new file mode 100644
index 0000000..ab32496
--- /dev/null
+++ b/common/health/entities.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+
+"""
+Copyright 2020 The Magma Authors.
+
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import textwrap
+
+
+class ActiveState:
+    ACTIVE = 'active'
+    RELOADING = 'reloading'
+    INACTIVE = 'inactive'
+    FAILED = 'failed'
+    ACTIVATING = 'activating'
+    DEACTIVATING = 'deactivating'
+
+    dbus2state = {
+        b'active': ACTIVE,
+        b'reloading': RELOADING,
+        b'inactive': INACTIVE,
+        b'failed': FAILED,
+        b'activating': ACTIVATING,
+        b'deactivating': DEACTIVATING,
+    }
+
+    state2status = {
+        ACTIVE: u'\u2714',
+        RELOADING: u'\u27A4',
+        INACTIVE: u'\u2717',
+        FAILED: u'\u2717',
+        ACTIVATING: u'\u27A4',
+        DEACTIVATING: u'\u27A4',
+    }
+
+
+class Errors:
+    def __init__(self, log_level, error_count):
+        self.log_level = log_level
+        self.error_count = error_count
+
+    def __str__(self):
+        return '{}: {}'.format(self.log_level, self.error_count)
+
+
+class RestartFrequency:
+    def __init__(self, count, time_interval):
+        self.count = count
+        self.time_interval = time_interval
+
+    def __str__(self):
+        return 'Restarted {} times {}'.format(
+            self.count,
+            self.time_interval,
+        )
+
+
+class HealthStatus:
+    DOWN = 'Down'
+    UP = 'Up'
+    UNKNOWN = 'Unknown'
+
+
+class Version:
+    def __init__(self, version_code, last_update_time):
+        self.version_code = version_code
+        self.last_update_time = last_update_time
+
+    def __str__(self):
+        return '{}, last updated: {}'.format(
+            self.version_code,
+            self.last_update_time,
+        )
+
+
+class ServiceHealth:
+    def __init__(
+        self, service_name, active_state, sub_state,
+        time_running, errors,
+    ):
+        self.service_name = service_name
+        self.active_state = active_state
+        self.sub_state = sub_state
+        self.time_running = time_running
+        self.errors = errors
+
+    def __str__(self):
+        return '{} {:20} {:10} {:15} {:10} {:>10} {:>10}'.format(
+            ActiveState.state2status.get(self.active_state, '-'),
+            self.service_name,
+            self.active_state,
+            self.sub_state,
+            self.time_running,
+            self.errors.log_level,
+            self.errors.error_count,
+        )
+
+
+class HealthSummary:
+    def __init__(
+        self, version, platform,
+        services_health,
+        internet_health, dns_health,
+        unexpected_restarts,
+    ):
+        self.version = version
+        self.platform = platform
+        self.services_health = services_health
+        self.internet_health = internet_health
+        self.dns_health = dns_health
+        self.unexpected_restarts = unexpected_restarts
+
+    def __str__(self):
+        any_restarts = any([
+            restarts.count
+            for restarts in self.unexpected_restarts.values()
+        ])
+        return textwrap.dedent("""
+            Running on {}
+            Version: {}:
+              {:20} {:10} {:15} {:10} {:>10} {:>10}
+            {}
+
+            Internet health: {}
+            DNS health: {}
+
+            Restart summary:
+            {}
+        """).format(
+            self.version, self.platform,
+            'Service', 'Status', 'SubState', 'Running for', 'Log level',
+            'Errors since last restart',
+            '\n'.join([str(h) for h in self.services_health]),
+            self.internet_health, self.dns_health,
+            '\n'.join([
+                '{:20} {}'.format(name, restarts)
+                for name, restarts
+                in self.unexpected_restarts.items()
+            ])
+            if any_restarts
+            else "No restarts since the gateway started",
+        )
diff --git a/common/health/health_service.py b/common/health/health_service.py
new file mode 100644
index 0000000..4228330
--- /dev/null
+++ b/common/health/health_service.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+
+"""
+Copyright 2020 The Magma Authors.
+
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import asyncio
+import os
+import subprocess
+from datetime import datetime
+
+import apt
+from dateutil import tz
+from common.health.entities import (
+    ActiveState,
+    Errors,
+    HealthStatus,
+    HealthSummary,
+    RestartFrequency,
+    ServiceHealth,
+    Version,
+)
+from common.service import MagmaService
+from common.service_registry import ServiceRegistry
+from configuration.mconfig_managers import load_service_mconfig_as_json
+from magmad.metrics import UNEXPECTED_SERVICE_RESTARTS
+from magmad.service_poller import ServicePoller
+from orc8r.protos import common_pb2, magmad_pb2
+from orc8r.protos.magmad_pb2_grpc import MagmadStub
+from orc8r.protos.mconfig import mconfigs_pb2
+from pystemd.systemd1 import Unit
+
+
+class GenericHealthChecker:
+
+    def ping(self, host, num_packets=4):
+        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
+        client = MagmadStub(chan)
+
+        response = client.RunNetworkTests(
+            magmad_pb2.NetworkTestRequest(
+                pings=[
+                    magmad_pb2.PingParams(
+                        host_or_ip=host,
+                        num_packets=num_packets,
+                    ),
+                ],
+            ),
+        )
+        return response.pings
+
+    def ping_status(self, host):
+        pings = self.ping(host=host, num_packets=4)[0]
+        if pings.error:
+            return HealthStatus.DOWN
+        if pings.avg_response_ms:
+            return HealthStatus.UP
+        return HealthStatus.UNKNOWN
+
+    def get_error_summary(self, service_names):
+        """Get the list of services with the error count.
+
+        Args:
+            service_names: List of service names.
+
+        Returns:
+            A dictionary with service name as a key and the Errors object
+            as a value.
+
+        Raises:
+            PermissionError: User has no permision to exectue the command
+        """
+        configs = {
+            service_name: load_service_mconfig_as_json(service_name)
+            for service_name in service_names
+        }
+        res = {
+            service_name: Errors(
+            log_level=configs[service_name].get('logLevel', 'INFO'),
+            error_count=0,
+            )
+            for service_name in service_names
+        }
+
+        syslog_path = '/var/log/syslog'
+        if not os.access(syslog_path, os.R_OK):
+            raise PermissionError(
+                'syslog is not readable. '
+                'Try `sudo chmod a+r {}`. '
+                'Or execute the command with sudo '
+                'permissions: `venvsudo`'.format(syslog_path),
+            )
+        with open(syslog_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                for service_name in service_names:
+                    if service_name not in line:
+                        continue
+                    # Reset the counter for restart/start
+                    if 'Starting {}...'.format(service_name) in line:
+                        res[service_name].error_count = 0
+                    elif 'ERROR' in line:
+                        res[service_name].error_count += 1
+        return res
+
+    def get_magma_services_summary(self):
+        """ Get health for all the running services """
+        services_health_summary = []
+
+        # DBus objects: https://www.freedesktop.org/wiki/Software/systemd/dbus/
+        chan = ServiceRegistry.get_rpc_channel('magmad', ServiceRegistry.LOCAL)
+        client = MagmadStub(chan)
+
+        configs = client.GetConfigs(common_pb2.Void())
+
+        service_names = [str(name) for name in configs.configs_by_key]
+        services_errors = self.get_error_summary(service_names=service_names)
+
+        for service_name in service_names:
+            unit = Unit(
+                'magma@{}.service'.format(service_name),
+                _autoload=True,
+            )
+            active_state = ActiveState.dbus2state[unit.Unit.ActiveState]
+            sub_state = str(unit.Unit.SubState, 'utf-8')
+            if active_state == ActiveState.ACTIVE:
+                pid = unit.Service.MainPID
+                process = subprocess.Popen(
+                    'ps -o etime= -p {}'.format(pid).split(),
+                    stdout=subprocess.PIPE,
+                )
+
+                time_running, error = process.communicate()
+                if error:
+                    raise ValueError(
+                        'Cannot get time running for the service '
+                        '{} `ps -o etime= -p {}`'
+                        .format(service_name, pid),
+                    )
+            else:
+                time_running = b'00'
+
+            services_health_summary.append(
+                ServiceHealth(
+                    service_name=service_name,
+                    active_state=active_state, sub_state=sub_state,
+                    time_running=str(time_running, 'utf-8').strip(),
+                    errors=services_errors[service_name],
+                ),
+            )
+        return services_health_summary
+
+    def get_unexpected_restart_summary(self):
+        service = MagmaService('magmad', mconfigs_pb2.MagmaD())
+        service_poller = ServicePoller(service.loop, service.config)
+        service_poller.start()
+
+        asyncio.set_event_loop(service.loop)
+
+        # noinspection PyProtectedMember
+        # pylint: disable=protected-access
+        async def fetch_info():
+            restart_frequencies = {}
+            await service_poller._get_service_info()
+            for service_name in service_poller.service_info.keys():
+                restarts = int(
+                    UNEXPECTED_SERVICE_RESTARTS
+                    .labels(service_name=service_name)
+                    ._value.get(),
+                )
+                restart_frequencies[service_name] = RestartFrequency(
+                    count=restarts,
+                    time_interval='',
+                )
+
+            return restart_frequencies
+
+        return service.loop.run_until_complete(fetch_info())
+
+    def get_kernel_version(self):
+        info, error = subprocess.Popen(
+            'uname -a'.split(),
+            stdout=subprocess.PIPE,
+        ).communicate()
+
+        if error:
+            raise ValueError('Cannot get the kernel version')
+        return str(info, 'utf-8')
+
+    def get_magma_version(self):
+        cache = apt.Cache()
+
+        # Return the python version if magma is not there
+        if 'magma' not in cache:
+            return Version(
+                version_code=cache['python3'].versions[0],
+                last_update_time='-',
+            )
+
+        pkg = str(cache['magma'].versions[0])
+        version = pkg.split('-')[0].split('=')[-1]
+        timestamp = int(pkg.split('-')[1])
+
+        return Version(
+            version_code=version,
+            last_update_time=datetime.utcfromtimestamp(timestamp)
+            .replace(tzinfo=tz.tzutc())
+            .astimezone(tz=tz.tzlocal())
+            .strftime('%Y-%m-%d %H:%M:%S'),
+        )
+
+    def get_health_summary(self):
+
+        return HealthSummary(
+            version=self.get_magma_version(),
+            platform=self.get_kernel_version(),
+            services_health=self.get_magma_services_summary(),
+            internet_health=self.ping_status(host='8.8.8.8'),
+            dns_health=self.ping_status(host='google.com'),
+            unexpected_restarts=self.get_unexpected_restart_summary(),
+        )
diff --git a/common/health/service_state_wrapper.py b/common/health/service_state_wrapper.py
new file mode 100644
index 0000000..7c1f707
--- /dev/null
+++ b/common/health/service_state_wrapper.py
@@ -0,0 +1,88 @@
+"""
+Copyright 2020 The Magma Authors.
+
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from common.redis.client import get_default_client
+from common.redis.containers import RedisFlatDict
+from common.redis.serializers import (
+    RedisSerde,
+    get_proto_deserializer,
+    get_proto_serializer,
+)
+from orc8r.protos.service_status_pb2 import ServiceExitStatus
+
+
+class ServiceStateWrapper:
+    """
+    Class wraps ServiceState interactions with redis
+    """
+
+    # Unique typename for Redis key
+    REDIS_VALUE_TYPE = "systemd_status"
+
+    def __init__(self):
+        serde = RedisSerde(
+            self.REDIS_VALUE_TYPE,
+            get_proto_serializer(),
+            get_proto_deserializer(ServiceExitStatus),
+        )
+        self._flat_dict = RedisFlatDict(get_default_client(), serde)
+
+    def update_service_status(
+        self, service_name: str,
+        service_status: ServiceExitStatus,
+    ) -> None:
+        """
+        Update the service exit status for a given service
+        """
+
+        if service_name in self._flat_dict:
+            current_service_status = self._flat_dict[service_name]
+        else:
+            current_service_status = ServiceExitStatus()
+
+        if service_status.latest_service_result == \
+                ServiceExitStatus.ServiceResult.Value("SUCCESS"):
+            service_status.num_clean_exits = \
+                current_service_status.num_clean_exits + 1
+            service_status.num_fail_exits = \
+                current_service_status.num_fail_exits
+        else:
+            service_status.num_fail_exits = \
+                current_service_status.num_fail_exits + 1
+            service_status.num_clean_exits = \
+                current_service_status.num_clean_exits
+        self._flat_dict[service_name] = service_status
+
+    def get_service_status(self, service_name: str) -> ServiceExitStatus:
+        """
+        Get the service status protobuf for a given service
+        @returns ServiceStatus protobuf object
+        """
+        return self._flat_dict[service_name]
+
+    def get_all_services_status(self) -> [str, ServiceExitStatus]:
+        """
+        Get a dict of service name to service status
+        @return dict of service_name to service map
+        """
+        service_status = {}
+        for k, v in self._flat_dict.items():
+            service_status[k] = v
+        return service_status
+
+    def cleanup_service_status(self) -> None:
+        """
+        Cleanup service status for all services in redis, mostly using for
+        testing
+        """
+        self._flat_dict.clear()