Add counters

Change-Id: I7ccb042ba2759f07cbe693217ba37f987cb9fd6b
diff --git a/VERSION b/VERSION
index b4d6d12..88a7b22 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.7.11
+0.7.12
diff --git a/edge-monitoring/agent_modem/edge_monitoring_agent_modem.py b/edge-monitoring/agent_modem/edge_monitoring_agent_modem.py
index b769153..7b0739c 100755
--- a/edge-monitoring/agent_modem/edge_monitoring_agent_modem.py
+++ b/edge-monitoring/agent_modem/edge_monitoring_agent_modem.py
@@ -10,7 +10,6 @@
 import logging
 import enum
 import pycurl
-import time
 import serial
 import subprocess
 import time
@@ -32,7 +31,7 @@
 config_file_contents = config_file_contents.replace("speedtest_iperf", "iperf_server")
 config_file_contents = config_file_contents.replace("\"speedtest_ping_dns\": \"8.8.8.8\",", "")
 # replace 1.1.1.1 with 8.8.8.8
-config_file_contents = config_file_contents.replace("\"1.1.1.1\": \"8.8.8.8\",", "")
+config_file_contents = config_file_contents.replace('1.1.1.1', '8.8.8.8')
 CONF = json.loads(
     config_file_contents, object_hook=lambda d: namedtuple('X', d.keys())(*d.values())
 )
@@ -68,7 +67,7 @@
         cmd = "ls " + CONF.modem.port
         sp = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, universal_newlines=True)
-        rc = sp.wait()
+        sp.wait()
         ret,err = sp.communicate()
         if err != "" :
             self.log.error("unable to find serial port " + err)
@@ -113,9 +112,11 @@
             return False, None
         return True, response
 
-    def get_state(self):
+    def get_state(self, counters):
         success, result = self.write('AT+CGATT?')
         if not success or 'CGATT:' not in result:
+            logging.error("AT+CGATT modem cmd failed")
+            counters['modem_cgatt_error'] += 1
             return State.error
         state = result.split('CGATT:')[1].split(' ')[0]
         return State(state)
@@ -124,11 +125,13 @@
         self.serial.close()
 
 
-def get_control_plane_state(modem, dongle_stats=None):
+def get_control_plane_state(modem, counters, dongle_stats=None):
     if not modem and dongle_stats:
-        if dongle_stats and dongle_stats['Connection'] == 'Connected':
+        if dongle_stats['Connection'] == 'Connected':
             return State.connected
         else:
+            logging.error("Dongle not connected: {}".format(dongle_stats['Connection']))
+            counters['dongle_connect_error'] += 1
             return State.disconnected
 
     # Disable radio fuction
@@ -138,14 +141,15 @@
         subprocess.check_output(
             "echo 'AT+CFUN=0' > " + modem.port, shell=True)
     except subprocess.CalledProcessError as e:
-        logging.error("Write 'AT+CFUN=0' failed")
+        logging.error("Write 'AT+CFUN=0' failed: {}".format(e))
+        counters['modem_cfun0_error'] += 1
         return State.error
 
     # Wait until the modem is fully disconnected
     retry = 0
     state = None
     while retry < CONF.detach_timeout:
-        state = modem.get_state()
+        state = modem.get_state(counters)
         if state is State.disconnected:
             break
         time.sleep(1)
@@ -163,13 +167,14 @@
         subprocess.check_output(
             "echo 'AT+CFUN=1' > " + modem.port, shell=True)
     except subprocess.CalledProcessError as e:
-        logging.error("Write 'AT+CFUN=1' failed")
+        logging.error("Write 'AT+CFUN=1' failed: {}".format(e))
+        counters['modem_cfun1_error'] += 1
         return State.error
 
     # Wait attach_timeout sec for the modem to be fully connected
     retry = 0
     while retry < CONF.attach_timeout:
-        state = modem.get_state()
+        state = modem.get_state(counters)
         if state is State.connected:
             break
         time.sleep(1)
@@ -181,26 +186,21 @@
     return state
 
 
-def get_user_plane_state():
+def dry_run_ping_test(counters):
     if "dry_run" in CONF.ips._fields and CONF.ips.dry_run: # run dry_run latency test as user plane test
-        dry_run_latency, dry_run_passed = run_ping_test(CONF.ips.dry_run, 10)
-        if dry_run_passed:
-            return State.connected, dry_run_latency
-        else:
-            logging.warning("Ping failed: {}".format(CONF.ips.dry_run))
-            return State.disconnected, dry_run_latency
+        return do_ping(CONF.ips.dry_run, 10)
     else: # run default user plane test
         try:
             subprocess.check_output(
                 "ping -c 3 " + CONF.ips.dns + ">/dev/null 2>&1",
                 shell=True)
-            return State.connected, None
+            return None, True
         except subprocess.CalledProcessError as e:
-            logging.warning("Ping failed: {}".format(CONF.ips.dns))
-            return State.disconnected, None
+            logging.warning("Ping failed for {}: {}".format(CONF.ips.dns, e))
+            return None, False
 
 
-def run_ping_test(ip, count):
+def do_ping(ip, count):
     '''
     Runs the ping test
     Input: IP to ping, # times to ping
@@ -222,6 +222,7 @@
     if not ip:
         return result, True
     try:
+        logging.debug("Pinging {}".format(ip))
         pingOutput = subprocess.check_output(
                     "ping -c " + str(count) + " " + \
                     ip, shell=True).decode("UTF-8").split()
@@ -250,7 +251,7 @@
     return result, True
 
 
-def get_ping_test(modem, dry_run_latency=None):
+def ping_test(modem, dry_run_latency=None):
     '''
     Prepares the ping test.
     Runs ping tests from 'ips' entry in config.json in order.
@@ -267,13 +268,13 @@
         if CONF.ips._fields[i] == "dry_run":
             continue
         count = 10
-        speedtest_ping[CONF.ips._fields[i]], status = run_ping_test(CONF.ips[i], count)
+        speedtest_ping[CONF.ips._fields[i]], status = do_ping(CONF.ips[i], count)
         if not status:
             ping_test_passed = False
-            logging.error("Ping test failed. Not running further tests.")
+            logging.error("Ping failed: {}".format(CONF.ips[i]))
     return speedtest_ping, ping_test_passed
 
-def run_iperf_test(ip, port, time_duration, is_downlink):
+def run_iperf_test(ip, port, time_duration, is_downlink, counters):
     '''
     Runs iperf test to specified IP in the config file.
     - Runs for 10 seconds (10 iterations)
@@ -298,13 +299,14 @@
             return result
         except Exception as e:
             err = e
+            counters['iperf_error'] += 1
             time.sleep(5)
             pass
     logging.error("After " + str(maxRetries) + " retries, iperf test failed for " + ip + ": %s", err)
     return result
 
 
-def get_iperf_test(modem):
+def iperf_test(counters):
     '''
     Prepares the iperf test.
     '''
@@ -320,19 +322,20 @@
             return None
     hour_iperf_scheduled_time_last_ran = int(time.strftime("%H"))
 
-    speedtest_iperf['cluster']['downlink'] = run_iperf_test(CONF.ips.iperf_server, CONF.iperf_port, 10, True)
-    speedtest_iperf['cluster']['uplink'] = run_iperf_test(CONF.ips.iperf_server, CONF.iperf_port, 10, False)
+    speedtest_iperf['cluster']['downlink'] = run_iperf_test(CONF.ips.iperf_server, CONF.iperf_port, 10, True, counters)
+    speedtest_iperf['cluster']['uplink'] = run_iperf_test(CONF.ips.iperf_server, CONF.iperf_port, 10, False, counters)
 
     return speedtest_iperf
 
 
-def get_signal_quality(modem, dongle_stats=None):
+def get_signal_quality(modem, counters, dongle_stats=None):
     if not modem and dongle_stats:
         if dongle_stats['RSRQ'] != '' and dongle_stats['RSRP'] != '':
             rsrq = int((float(dongle_stats['RSRQ']) + 19.5) * 2)
             rsrp = int(float(dongle_stats['RSRP']) + 140)
             return {'rsrq': rsrq, 'rsrp': rsrp}
         else:
+            counters['dongle_rsrp_rsrq_error'] += 1
             return {'rsrq': 0, 'rsrp': 0}
 
     # Fall back to modem cmds
@@ -341,9 +344,9 @@
     logging.debug("get_signal_quality success %i result %s",success,result)
     if not success or 'CESQ: ' not in result:
         logging.error("Failed to get signal quality")
+        counters['modem_cesq_error'] += 1
         return {'rsrq':0, 'rsrp':0}
 
-    logging.debug("%s", result)
     tmp_rsrq = result.split('CESQ:')[1].split(',')[4]
     tmp_rsrp = result.split('CESQ:')[1].split(',')[5]
 
@@ -357,7 +360,7 @@
     return result
 
 
-def get_dongle_stats():
+def get_dongle_stats(counters):
     result = {'SuccessfulFetch' : False}
     if "report_in_band" in CONF._fields:
         result['inBandReporting'] = CONF.report_in_band
@@ -386,22 +389,25 @@
         dongleStatsXML = ET.fromstring(subprocess.check_output("curl -u admin:admin -s 'http://192.168.0.1:8080/cgi-bin/ltestatus.cgi?Command=Status'", shell=True).decode("UTF-8"))
     except Exception as e:
         logging.error("Failed to fetch dongle stats from URL: " + str(e))
+        counters['dongle_read_error'] += 1
         return result
     try:
         for key in XMLkeys:
             try:
                 result[key] = dongleStatsXML.find(key).text
-            except AttributeError as e:
-                logging.warn("Failed to find " + key + " in XML.")
+            except AttributeError:
+                logging.error("Failed to find " + key + " in XML")
+                counters['dongle_read_error'] += 1
                 result[key] = ""
         result["SuccessfulFetch"] = True
     except Exception as e:
         logging.error("Failed to fetch dongle stats from XML: " + str(e))
+        counters['dongle_read_error'] += 1
         return result
     return result
 
 
-def report_status(signal_quality, dongle_stats, cp_state=None, up_state=None, speedtest_ping=None, speedtest_iperf=None):
+def report_status(counters, signal_quality, dongle_stats, cp_state=None, up_state=None, speedtest_ping=None, speedtest_iperf=None):
     report = {
         'name': CONF.edge_name,
         'status': {
@@ -446,6 +452,7 @@
         report['speedtest']['iperf'] = speedtest_iperf
     report['signal_quality'] = signal_quality
     report['dongle_stats'] = dongle_stats
+    report['counters'] = counters
 
     logging.info("Sending report %s", report)
     global cycles
@@ -516,6 +523,20 @@
     time.sleep(delay)
     subprocess.check_output("sudo shutdown -r now", shell=True)
 
+def init_counters():
+    return {
+        'dongle_read_error': 0,
+        'dongle_connect_error': 0,
+        'dongle_rsrp_rsrq_error': 0,
+        'modem_cfun0_error': 0,
+        'modem_cfun1_error': 0,
+        'modem_cgatt_error': 0,
+        'modem_cesq_error': 0,
+        'dry_run_ping_error': 0,
+        'ping_error': 0,
+        'iperf_error': 0
+    }
+
 def main():
     global cycles
     global hour_iperf_scheduled_time_last_ran
@@ -554,39 +575,47 @@
         modem = None
 
     connect_retries = 0
+    counters = init_counters()
     while True:
         dongle_retries = 0
-        dongle_stats = get_dongle_stats()
+        dongle_stats = get_dongle_stats(counters)
         while not dongle_stats['SuccessfulFetch']:
-            log.error("Dongle error")
             dongle_retries += 1
             if dongle_retries > 10:
-                logging.warn("Rebooting Pi")
+                logging.warning("Rebooting Pi: dongle not readable")
                 os.system("shutdown /r /t 0")
                 sys.exit(1)
-            dongle_stats = get_dongle_stats()
+            dongle_stats = get_dongle_stats(counters)
 
-        cp_state = get_control_plane_state(modem, dongle_stats)
+        cp_state = get_control_plane_state(modem, counters, dongle_stats)
 
         if cp_state != State.connected:
             logging.error("Dongle not connected")
             connect_retries += 1
             if connect_retries > 10:
+                logging.warning("Rebooting Pi: dongle not connected")
                 os.system("shutdown /r /t 0")
                 sys.exit(1)
 
-        signal_quality = get_signal_quality(modem, dongle_stats)
-        up_state, dry_run_latency = get_user_plane_state()
-        if up_state is State.disconnected:
+        signal_quality = get_signal_quality(modem, counters, dongle_stats)
+
+        dry_run_ping_latency, dry_run_ping_result = dry_run_ping_test(counters)
+        if not dry_run_ping_result:
             logging.error("Dry run ping failed")
+            counters['dry_run_ping_error'] += 1
 
-        speedtest_ping, speedtest_status = get_ping_test(modem, dry_run_latency)
-        if not speedtest_status:
-            logging.error("Ping tests failed")
+        ping_latency, ping_result = ping_test(modem, dry_run_ping_latency)
+        if not ping_result:
+            logging.error("Ping test failed")
+            counters['ping_error'] += 1
 
-        speedtest_iperf = get_iperf_test(modem)
+        # If either of the ping tests pass, then declare user plane connected
+        if dry_run_ping_result or ping_result:
+            up_state = State.connected
 
-        report_status(signal_quality, dongle_stats, cp_state, up_state, speedtest_ping, speedtest_iperf)
+        speedtest_iperf = iperf_test(counters)
+
+        report_status(counters, signal_quality, dongle_stats, cp_state, up_state, ping_latency, speedtest_iperf)
         time.sleep(CONF.report_interval)
 
     modem.close()