VOL-2514: Clean up mib download task under load

- under load mib download task is falsely tried again
  seemingly due to exception handling.  simplify a bit
  now that most work has been removed from this task

- also strobe watchdog after successful omci_cc call

- do not start pm/test jobs until mib is in sync
  also start at a random offset

- any task failures now try again in the same way and do
  not overwrite the instance deferred.
  also retry at a random offset

- one less core reason update

Change-Id: I2563f9228194c8a605e1981cb115b499d3c89c4d
diff --git a/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py b/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py
index 5ab5b9c..074f39e 100644
--- a/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py
+++ b/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py
@@ -23,6 +23,7 @@
 import arrow
 import structlog
 import json
+import random
 
 from collections import OrderedDict
 
@@ -70,7 +71,7 @@
 OP = EntityOperations
 RC = ReasonCodes
 
-_STARTUP_RETRY_WAIT = 20
+_STARTUP_RETRY_WAIT = 10
 
 
 class BrcmOpenomciOnuHandler(object):
@@ -90,7 +91,10 @@
         self.tx_id = 0
         self._enabled = False
         self.events = None
-        self.pm_metrics = None
+        self._pm_metrics = None
+        self._pm_metrics_started = False
+        self._test_request = None
+        self._test_request_started = False
         self._omcc_version = OMCCVersion.Unknown
         self._total_tcont_count = 0  # From ANI-G ME
         self._qos_flexibility = 0  # From ONT2_G ME
@@ -264,11 +268,11 @@
                 OnuOmciPmMetrics.OMCI_DEV_KEY: self._onu_omci_device
             }
             self.log.debug('create-pm-metrics', device_id=device.id, serial_number=device.serial_number)
-            self.pm_metrics = OnuPmMetrics(self.events, self.core_proxy, self.device_id,
+            self._pm_metrics = OnuPmMetrics(self.events, self.core_proxy, self.device_id,
                                            self.logical_device_id, device.serial_number,
                                            grouped=True, freq_override=False, **kwargs)
-            pm_config = self.pm_metrics.make_proto()
-            self._onu_omci_device.set_pm_config(self.pm_metrics.omci_pm.openomci_interval_pm)
+            pm_config = self._pm_metrics.make_proto()
+            self._onu_omci_device.set_pm_config(self._pm_metrics.omci_pm.openomci_interval_pm)
             self.log.info("initial-pm-config", device_id=device.id, serial_number=device.serial_number)
             yield self.core_proxy.device_pm_config_update(pm_config, init=True)
 
@@ -276,22 +280,19 @@
             self._onu_omci_device.alarm_synchronizer.set_alarm_params(mgr=self.events,
                                                                       ani_ports=[self._pon])
 
-            # Start collecting stats from the device after a brief pause
-            reactor.callLater(10, self.pm_metrics.start_collector)
-
             # Code to Run OMCI Test Action
             kwargs_omci_test_action = {
                 OmciTestRequest.DEFAULT_FREQUENCY_KEY:
                     OmciTestRequest.DEFAULT_COLLECTION_FREQUENCY
             }
             serial_number = device.serial_number
-            test_request = OmciTestRequest(self.core_proxy,
+            self._test_request = OmciTestRequest(self.core_proxy,
                                            self.omci_agent, self.device_id,
                                            AniG, serial_number,
                                            self.logical_device_id,
                                            exclusive=False,
                                            **kwargs_omci_test_action)
-            reactor.callLater(60, test_request.start_collector)
+
             self.enabled = True
         else:
             self.log.info('onu-already-activated')
@@ -491,8 +492,9 @@
                                                                'tech-profile-config-download-failure-retrying')
                     if tp_path in self._tp_service_specific_task[uni_id]:
                         del self._tp_service_specific_task[uni_id][tp_path]
-                    self._deferred = reactor.callLater(_STARTUP_RETRY_WAIT, self.load_and_configure_tech_profile,
-                                                       uni_id, tp_path)
+                    retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+                    reactor.callLater(retry, self.load_and_configure_tech_profile,
+                                      uni_id, tp_path)
 
                 self.log.info('downloading-tech-profile-configuration')
                 # Extract the current set of TCONT and GEM Ports from the Handler's pon_port that are
@@ -562,8 +564,9 @@
                     for gp in new_gems:
                         self.pon_port.remove_gem_id(gp.gem_id, gp.direction, False)
 
-                    self._deferred = reactor.callLater(_STARTUP_RETRY_WAIT, self.load_and_configure_tech_profile,
-                                                       uni_id, tp_path)
+                    retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+                    reactor.callLater(retry, self.load_and_configure_tech_profile,
+                                      uni_id, tp_path)
 
                 self._tp_service_specific_task[uni_id][tp_path] = \
                     BrcmTpSetupTask(self.omci_agent, self, uni_id, [], new_gems, int(tp_path.split("/")[1]))
@@ -634,9 +637,8 @@
                               _reason=_reason)
                 yield self.core_proxy.device_reason_update(self.device_id,
                                                            'tech-profile-config-delete-failure-retrying')
-                self._deferred = \
-                    self._onu_omci_device.task_runner.queue_task(self._tp_service_specific_task[uni_id][tp_path])
-                self._deferred.addCallbacks(success, failure)
+                retry = _STARTUP_RETRY_WAIT * (random.randint(1, 5))
+                reactor.callLater(retry, self.delete_tech_profile, uni_id, tp_path, alloc_id, gem_port_id)
 
             self.log.info('deleting-tech-profile-configuration')
 
@@ -661,7 +663,7 @@
     def update_pm_config(self, device, pm_config):
         # TODO: This has not been tested
         self.log.info('update_pm_config', pm_config=pm_config)
-        self.pm_metrics.update(pm_config)
+        self._pm_metrics.update(pm_config)
 
     # Calling this assumes the onu is active/ready and had at least an initial mib downloaded.   This gets called from
     # flow decomposition that ultimately comes from onos
@@ -849,9 +851,10 @@
             def failure(_reason):
                 self.log.warn('vlan-tagging-failure', uni_port=uni_port, vlan=_set_vlan_vid, tp_id=tp_id)
                 yield self.core_proxy.device_reason_update(self.device_id, 'omci-flows-failed-retrying')
-                self._vlan_filter_task = reactor.callLater(_STARTUP_RETRY_WAIT,
-                                                           self._add_vlan_filter_task, device, uni_port.port_number,
-                                                           uni_port, _set_vlan_vid, tp_id)
+                retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+                reactor.callLater(retry,
+                                  self._add_vlan_filter_task, device, uni_port.port_number,
+                                  uni_port, _set_vlan_vid, tp_id)
 
             self.log.info('setting-vlan-tag')
             self._vlan_filter_task = BrcmVlanFilterTask(self.omci_agent, self, uni_port, _set_vlan_vid, tp_id)
@@ -1227,11 +1230,24 @@
                     self._mib_download_task = None
                     yield self.onu_active_event()
 
+                    # Start collecting stats from the device after a brief pause
+                    if not self._pm_metrics_started:
+                        self._pm_metrics_started = True
+                        pmstart = _STARTUP_RETRY_WAIT * (random.randint(1, 5))
+                        reactor.callLater(pmstart, self._pm_metrics.start_collector)
+
+                    # Start test requests after a brief pause
+                    if not self._test_request_started:
+                        self._test_request_started = True
+                        tststart = _STARTUP_RETRY_WAIT * (random.randint(1, 5))
+                        reactor.callLater(tststart, self._test_request.start_collector)
+
                 @inlineCallbacks
                 def failure(_reason):
                     self.log.warn('mib-download-failure-retrying', _reason=_reason)
                     yield self.core_proxy.device_reason_update(self.device_id, 'initial-mib-download-failure-retrying')
-                    self._deferred = reactor.callLater(_STARTUP_RETRY_WAIT, self._mib_in_sync)
+                    retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+                    reactor.callLater(retry, self._mib_in_sync)
 
                 # start by locking all the unis till mib sync and initial mib is downloaded
                 # this way we can capture the port down/up events when we are ready
diff --git a/python/adapters/brcm_openomci_onu/omci/brcm_mib_download_task.py b/python/adapters/brcm_openomci_onu/omci/brcm_mib_download_task.py
index b0fe3d8..678fd54 100644
--- a/python/adapters/brcm_openomci_onu/omci/brcm_mib_download_task.py
+++ b/python/adapters/brcm_openomci_onu/omci/brcm_mib_download_task.py
@@ -141,8 +141,8 @@
                        error_mask=error_mask, failed_mask=failed_mask,
                        unsupported_mask=unsupported_mask)
 
+        self.strobe_watchdog()
         if status == RC.Success:
-            self.strobe_watchdog()
             return True
 
         elif status == RC.InstanceExists:
@@ -161,37 +161,17 @@
         try:
             self.log.info('perform-download')
 
-            if self._handler.enabled and len(self._handler.uni_ports) > 0:
-                yield self._handler.core_proxy.device_reason_update(self.device_id, 'performing-initial-mib-download')
+            # Provision the initial bridge configuration
+            yield self.perform_initial_bridge_setup()
 
-            try:
-                # Lock the UNI ports to prevent any alarms during initial configuration
-                # of the ONU
-                self.strobe_watchdog()
-
+            for uni_port in self._handler.uni_ports:
                 # Provision the initial bridge configuration
-                yield self.perform_initial_bridge_setup()
+                yield self.perform_uni_initial_bridge_setup(uni_port)
 
-                for uni_port in self._handler.uni_ports:
-                    # Provision the initial bridge configuration
-                    yield self.perform_uni_initial_bridge_setup(uni_port)
-
-                self.deferred.callback('initial-download-success')
-
-            except TimeoutError as e:
-                self.log.error('initial-download-failure', e=e)
-                self.deferred.errback(failure.Failure(e))
-
-            except Exception as e:
-                self.log.exception('initial-download-failure', e=e)
-                self.deferred.errback(failure.Failure(e))
-
-            else:
-                e = MibResourcesFailure('Required resources are not available',
-                                        len(self._handler.uni_ports))
-                self.deferred.errback(failure.Failure(e))
-        except BaseException as e:
-            self.log.debug('cannot-start-mib-download', exception=e)
+            self.deferred.callback('initial-download-success')
+        except Exception as e:
+            self.log.error('initial-download-failure', e=e)
+            self.deferred.errback(failure.Failure(e))
 
     @inlineCallbacks
     def perform_initial_bridge_setup(self):
diff --git a/python/adapters/brcm_openomci_onu/omci/brcm_uni_lock_task.py b/python/adapters/brcm_openomci_onu/omci/brcm_uni_lock_task.py
index 3884620..eb090e1 100644
--- a/python/adapters/brcm_openomci_onu/omci/brcm_uni_lock_task.py
+++ b/python/adapters/brcm_openomci_onu/omci/brcm_uni_lock_task.py
@@ -81,6 +81,7 @@
         super(BrcmUniLockTask, self).start()
         self._local_deferred = reactor.callLater(0, self.perform_lock)
 
+    @inlineCallbacks
     def perform_lock(self):
         """
         Perform the lock/unlock
@@ -99,29 +100,29 @@
                 for entity_id in pptp_list:
                     msg = PptpEthernetUniFrame(entity_id,
                                                attributes=dict(administrative_state=state))
-                    self._send_omci_msg(msg)
+                    yield self._send_omci_msg(msg)
 
                 for entity_id in veip_list:
                     msg = VeipUniFrame(entity_id,
                                        attributes=dict(administrative_state=state))
-                    self._send_omci_msg(msg)
+                    yield self._send_omci_msg(msg)
 
                 msg = OntGFrame(attributes={'administrative_state': state})
-                self._send_omci_msg(msg)
+                yield self._send_omci_msg(msg)
             else:
                 # ontg must be unlocked first, then unis
                 msg = OntGFrame(attributes={'administrative_state': state})
-                self._send_omci_msg(msg)
+                yield self._send_omci_msg(msg)
 
                 for entity_id in pptp_list:
                     msg = PptpEthernetUniFrame(entity_id,
                                                attributes=dict(administrative_state=state))
-                    self._send_omci_msg(msg)
+                    yield self._send_omci_msg(msg)
 
                 for entity_id in veip_list:
                     msg = VeipUniFrame(entity_id,
                                        attributes=dict(administrative_state=state))
-                    self._send_omci_msg(msg)
+                    yield self._send_omci_msg(msg)
 
             self.deferred.callback('setting-uni-lock-state-finished')