VOL-2514: Clean up mib download task under load
- under load mib download task is falsely tried again
seemingly due to exception handling. simplify a bit
now that most work has been removed from this task
- also strobe watchdog after successful omci_cc call
- do not start pm/test jobs until mib is in sync
also start at a random offset
- any task failures now try again in the same way and do
not overwrite the instance deferred.
also retry at a random offset
- one less core reason update
Change-Id: I2563f9228194c8a605e1981cb115b499d3c89c4d
diff --git a/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py b/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py
index 5ab5b9c..074f39e 100644
--- a/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py
+++ b/python/adapters/brcm_openomci_onu/brcm_openomci_onu_handler.py
@@ -23,6 +23,7 @@
import arrow
import structlog
import json
+import random
from collections import OrderedDict
@@ -70,7 +71,7 @@
OP = EntityOperations
RC = ReasonCodes
-_STARTUP_RETRY_WAIT = 20
+_STARTUP_RETRY_WAIT = 10
class BrcmOpenomciOnuHandler(object):
@@ -90,7 +91,10 @@
self.tx_id = 0
self._enabled = False
self.events = None
- self.pm_metrics = None
+ self._pm_metrics = None
+ self._pm_metrics_started = False
+ self._test_request = None
+ self._test_request_started = False
self._omcc_version = OMCCVersion.Unknown
self._total_tcont_count = 0 # From ANI-G ME
self._qos_flexibility = 0 # From ONT2_G ME
@@ -264,11 +268,11 @@
OnuOmciPmMetrics.OMCI_DEV_KEY: self._onu_omci_device
}
self.log.debug('create-pm-metrics', device_id=device.id, serial_number=device.serial_number)
- self.pm_metrics = OnuPmMetrics(self.events, self.core_proxy, self.device_id,
+ self._pm_metrics = OnuPmMetrics(self.events, self.core_proxy, self.device_id,
self.logical_device_id, device.serial_number,
grouped=True, freq_override=False, **kwargs)
- pm_config = self.pm_metrics.make_proto()
- self._onu_omci_device.set_pm_config(self.pm_metrics.omci_pm.openomci_interval_pm)
+ pm_config = self._pm_metrics.make_proto()
+ self._onu_omci_device.set_pm_config(self._pm_metrics.omci_pm.openomci_interval_pm)
self.log.info("initial-pm-config", device_id=device.id, serial_number=device.serial_number)
yield self.core_proxy.device_pm_config_update(pm_config, init=True)
@@ -276,22 +280,19 @@
self._onu_omci_device.alarm_synchronizer.set_alarm_params(mgr=self.events,
ani_ports=[self._pon])
- # Start collecting stats from the device after a brief pause
- reactor.callLater(10, self.pm_metrics.start_collector)
-
# Code to Run OMCI Test Action
kwargs_omci_test_action = {
OmciTestRequest.DEFAULT_FREQUENCY_KEY:
OmciTestRequest.DEFAULT_COLLECTION_FREQUENCY
}
serial_number = device.serial_number
- test_request = OmciTestRequest(self.core_proxy,
+ self._test_request = OmciTestRequest(self.core_proxy,
self.omci_agent, self.device_id,
AniG, serial_number,
self.logical_device_id,
exclusive=False,
**kwargs_omci_test_action)
- reactor.callLater(60, test_request.start_collector)
+
self.enabled = True
else:
self.log.info('onu-already-activated')
@@ -491,8 +492,9 @@
'tech-profile-config-download-failure-retrying')
if tp_path in self._tp_service_specific_task[uni_id]:
del self._tp_service_specific_task[uni_id][tp_path]
- self._deferred = reactor.callLater(_STARTUP_RETRY_WAIT, self.load_and_configure_tech_profile,
- uni_id, tp_path)
+ retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+ reactor.callLater(retry, self.load_and_configure_tech_profile,
+ uni_id, tp_path)
self.log.info('downloading-tech-profile-configuration')
# Extract the current set of TCONT and GEM Ports from the Handler's pon_port that are
@@ -562,8 +564,9 @@
for gp in new_gems:
self.pon_port.remove_gem_id(gp.gem_id, gp.direction, False)
- self._deferred = reactor.callLater(_STARTUP_RETRY_WAIT, self.load_and_configure_tech_profile,
- uni_id, tp_path)
+ retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+ reactor.callLater(retry, self.load_and_configure_tech_profile,
+ uni_id, tp_path)
self._tp_service_specific_task[uni_id][tp_path] = \
BrcmTpSetupTask(self.omci_agent, self, uni_id, [], new_gems, int(tp_path.split("/")[1]))
@@ -634,9 +637,8 @@
_reason=_reason)
yield self.core_proxy.device_reason_update(self.device_id,
'tech-profile-config-delete-failure-retrying')
- self._deferred = \
- self._onu_omci_device.task_runner.queue_task(self._tp_service_specific_task[uni_id][tp_path])
- self._deferred.addCallbacks(success, failure)
+ retry = _STARTUP_RETRY_WAIT * (random.randint(1, 5))
+ reactor.callLater(retry, self.delete_tech_profile, uni_id, tp_path, alloc_id, gem_port_id)
self.log.info('deleting-tech-profile-configuration')
@@ -661,7 +663,7 @@
def update_pm_config(self, device, pm_config):
# TODO: This has not been tested
self.log.info('update_pm_config', pm_config=pm_config)
- self.pm_metrics.update(pm_config)
+ self._pm_metrics.update(pm_config)
# Calling this assumes the onu is active/ready and had at least an initial mib downloaded. This gets called from
# flow decomposition that ultimately comes from onos
@@ -849,9 +851,10 @@
def failure(_reason):
self.log.warn('vlan-tagging-failure', uni_port=uni_port, vlan=_set_vlan_vid, tp_id=tp_id)
yield self.core_proxy.device_reason_update(self.device_id, 'omci-flows-failed-retrying')
- self._vlan_filter_task = reactor.callLater(_STARTUP_RETRY_WAIT,
- self._add_vlan_filter_task, device, uni_port.port_number,
- uni_port, _set_vlan_vid, tp_id)
+ retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+ reactor.callLater(retry,
+ self._add_vlan_filter_task, device, uni_port.port_number,
+ uni_port, _set_vlan_vid, tp_id)
self.log.info('setting-vlan-tag')
self._vlan_filter_task = BrcmVlanFilterTask(self.omci_agent, self, uni_port, _set_vlan_vid, tp_id)
@@ -1227,11 +1230,24 @@
self._mib_download_task = None
yield self.onu_active_event()
+ # Start collecting stats from the device after a brief pause
+ if not self._pm_metrics_started:
+ self._pm_metrics_started = True
+ pmstart = _STARTUP_RETRY_WAIT * (random.randint(1, 5))
+ reactor.callLater(pmstart, self._pm_metrics.start_collector)
+
+ # Start test requests after a brief pause
+ if not self._test_request_started:
+ self._test_request_started = True
+ tststart = _STARTUP_RETRY_WAIT * (random.randint(1, 5))
+ reactor.callLater(tststart, self._test_request.start_collector)
+
@inlineCallbacks
def failure(_reason):
self.log.warn('mib-download-failure-retrying', _reason=_reason)
yield self.core_proxy.device_reason_update(self.device_id, 'initial-mib-download-failure-retrying')
- self._deferred = reactor.callLater(_STARTUP_RETRY_WAIT, self._mib_in_sync)
+ retry = _STARTUP_RETRY_WAIT * (random.randint(1,5))
+ reactor.callLater(retry, self._mib_in_sync)
# start by locking all the unis till mib sync and initial mib is downloaded
# this way we can capture the port down/up events when we are ready