VOL-1330: Limit the number of failed resync attempts.
Rather than trying to resync forever and leaving the ONU
unusable, limit the number of attempts to a configurable value.
Some ONUs refuse a MIB upload after being provisioned, which
prevents them from getting back in sync and being restored to
service if the OLT or VOLTHA is restarted and the ONU is not.
If the limit is hit, force a reset of the ONU and re-upload the
MIB. A reset ONU always allows a MIB upload.
This deals with ONUs that refuse to upload their MIB after being
configured and then disconnected from and reconnected to the PON,
e.g. after a fiber pull or OLT restart where the ONU did *not* restart.
Change-Id: I0451b9954980f36efd035fbfc381bdc328aa8571
diff --git a/pyvoltha/adapters/extensions/omci/state_machines/mib_sync.py b/pyvoltha/adapters/extensions/omci/state_machines/mib_sync.py
index 8d8d54c..d38cbaf 100644
--- a/pyvoltha/adapters/extensions/omci/state_machines/mib_sync.py
+++ b/pyvoltha/adapters/extensions/omci/state_machines/mib_sync.py
@@ -75,6 +75,7 @@
DEFAULT_TIMEOUT_RETRY = 5 # Seconds to delay after task failure/timeout
DEFAULT_AUDIT_DELAY = 60 # Periodic tick to audit the MIB Data Sync
DEFAULT_RESYNC_DELAY = 300 # Periodically force a resync
+ DEFAULT_RESYNC_FAIL_LIMIT = 5 # Number of times to try to resync an existing onu before force resetting.
def __init__(self, agent, device_id, mib_sync_tasks, db,
advertise_events=False,
@@ -83,7 +84,8 @@
initial_state='disabled',
timeout_delay=DEFAULT_TIMEOUT_RETRY,
audit_delay=DEFAULT_AUDIT_DELAY,
- resync_delay=DEFAULT_RESYNC_DELAY):
+ resync_delay=DEFAULT_RESYNC_DELAY,
+ resync_fail_limit=DEFAULT_RESYNC_FAIL_LIMIT):
"""
Class initialization
@@ -102,6 +104,9 @@
an audit manually by calling 'self.audit_mib'
:param resync_delay: (int) Seconds in sync before performing a forced MIB
resynchronization
+ :param resync_fail_limit: (int) Number of attempts at resynchronizing the onu
+ before giving up and resetting/re-uploading the mib.
+ Setting to 0 disables the limit allowing unlimited attempts.
"""
self.log = structlog.get_logger(device_id=device_id)
@@ -112,6 +117,7 @@
self._timeout_delay = timeout_delay
self._audit_delay = audit_delay
self._resync_delay = resync_delay
+ self._resync_fail_limit = resync_fail_limit
self._upload_task = mib_sync_tasks['mib-upload']
self._get_mds_task = mib_sync_tasks['get-mds']
@@ -127,6 +133,7 @@
self._last_mib_db_sync_value = None
self._device_in_db = False
self._next_resync = None
+ self._failed_resync_count = 0
self._on_olt_only_diffs = None
self._on_onu_only_diffs = None
@@ -354,6 +361,8 @@
# Determine if this ONU has ever synchronized
if self.is_new_onu:
+ # clear resync failure counter if we "started over"
+ self._failed_resync_count = 0
# Start full MIB upload
self._deferred = reactor.callLater(0, self.upload_mib)
@@ -546,6 +555,19 @@
def failure(reason):
self.log.info('resync-failure', reason=reason)
self._current_task = None
+
+ # if we continue to fail resync after configured number of times then give up
+ # and reset the onu, reupload the mib db and start over. Setting last_mib_db_sync_value
+ # to None forces the state machine to start over on calling timeout trigger
+ self._failed_resync_count += 1
+ if self._resync_fail_limit > 0 and self._failed_resync_count >= self._resync_fail_limit:
+ self.log.warn("resync-forcing-reset", attempt_count=self._failed_resync_count,
+ limit=self._resync_fail_limit)
+ self._last_mib_db_sync_value = None
+ else:
+ self.log.info("resync-attempt-count", attempt_count=self._failed_resync_count,
+ limit=self._resync_fail_limit)
+
self._deferred = reactor.callLater(self._timeout_delay, self.timeout)
self._current_task = self._resync_task(self._agent, self._device_id)