VOL-3876: openonu-go adapter goes on a continues loop to delete L2 PM counter if it fails to delete
- also resolve deadlock issue seen during pm config update scenario
- fix some logging
- update timeout value for omci get metrics
Change-Id: Ic27f47eb53bf4a3f7f8b49fc2c16f43ba62446a5
diff --git a/internal/pkg/onuadaptercore/device_handler.go b/internal/pkg/onuadaptercore/device_handler.go
index 27d04dc..556fe62 100644
--- a/internal/pkg/onuadaptercore/device_handler.go
+++ b/internal/pkg/onuadaptercore/device_handler.go
@@ -2752,21 +2752,21 @@
func (dh *deviceHandler) handleGlobalPmConfigUpdates(ctx context.Context, pmConfigs *voltha.PmConfigs) []error {
var err error
var errorsList []error
- logger.Infow(ctx, "handling-global-pm-config-params", log.Fields{"device-id": dh.device.Id})
+ logger.Infow(ctx, "handling-global-pm-config-params - start", log.Fields{"device-id": dh.device.Id})
if pmConfigs.DefaultFreq != dh.pmConfigs.DefaultFreq {
if err = dh.pOnuMetricsMgr.updateDefaultFrequency(ctx, pmConfigs); err != nil {
errorsList = append(errorsList, err)
}
}
-
+ logger.Infow(ctx, "handling-global-pm-config-params - done", log.Fields{"device-id": dh.device.Id})
return errorsList
}
func (dh *deviceHandler) handleGroupPmConfigUpdates(ctx context.Context, pmConfigs *voltha.PmConfigs) []error {
var err error
var errorsList []error
- logger.Debugw(ctx, "handling-group-pm-config-params", log.Fields{"device-id": dh.device.Id})
+ logger.Debugw(ctx, "handling-group-pm-config-params - start", log.Fields{"device-id": dh.device.Id})
// Check if group metric related config is updated
for _, v := range pmConfigs.Groups {
dh.pOnuMetricsMgr.onuMetricsManagerLock.RLock()
@@ -2784,13 +2784,14 @@
}
}
}
+ logger.Debugw(ctx, "handling-group-pm-config-params - done", log.Fields{"device-id": dh.device.Id})
return errorsList
}
func (dh *deviceHandler) handleStandalonePmConfigUpdates(ctx context.Context, pmConfigs *voltha.PmConfigs) []error {
var err error
var errorsList []error
- logger.Debugw(ctx, "handling-individual-pm-config-params", log.Fields{"device-id": dh.device.Id})
+ logger.Debugw(ctx, "handling-individual-pm-config-params - start", log.Fields{"device-id": dh.device.Id})
// Check if standalone metric related config is updated
for _, v := range pmConfigs.Metrics {
dh.pOnuMetricsMgr.onuMetricsManagerLock.RLock()
@@ -2808,6 +2809,7 @@
}
}
}
+ logger.Debugw(ctx, "handling-individual-pm-config-params - done", log.Fields{"device-id": dh.device.Id})
return errorsList
}
diff --git a/internal/pkg/onuadaptercore/omci_cc.go b/internal/pkg/onuadaptercore/omci_cc.go
index 4776796..dcdbe1b 100644
--- a/internal/pkg/onuadaptercore/omci_cc.go
+++ b/internal/pkg/onuadaptercore/omci_cc.go
@@ -45,7 +45,7 @@
// ### OMCI related definitions - retrieved from Python adapter code/trace ####
//ConstDefaultOmciTimeout - Default OMCI Timeout
-const ConstDefaultOmciTimeout = 10 // ( 3 ?) Seconds
+const ConstDefaultOmciTimeout = 3
const galEthernetEID = uint16(1)
const maxGemPayloadSize = uint16(48)
diff --git a/internal/pkg/onuadaptercore/onu_metrics_manager.go b/internal/pkg/onuadaptercore/onu_metrics_manager.go
index cd83725..747ba8a 100644
--- a/internal/pkg/onuadaptercore/onu_metrics_manager.go
+++ b/internal/pkg/onuadaptercore/onu_metrics_manager.go
@@ -172,6 +172,7 @@
L2PmCollectionInterval = 15 * 60 // Unit in seconds. Do not change this as this fixed by OMCI specification for L2 PM counters
SyncTimeRetryInterval = 15 // Unit seconds
L2PmCreateAttempts = 3
+ L2PmDeleteAttempts = 3
L2PmCollectAttempts = 3
// Per Table 11.2.9-1 – OMCI baseline message limitations in G.988 spec, the max GET Response
// payload size is 25. We define 24 (one less) to allow for dynamic insertion of IntervalEndTime
@@ -221,6 +222,7 @@
isL2PMCounter bool // true for only L2 PM counters
collectAttempts uint32 // number of attempts to collect L2 PM data
createRetryAttempts uint32 // number of attempts to try creating the L2 PM ME
+ deleteRetryAttempts uint32 // number of attempts to try deleting the L2 PM ME
}
type standaloneMetric struct {
@@ -372,6 +374,7 @@
v.nextCollectionInterval = time.Now().Add(time.Duration(newGroupFreq) * time.Second)
updated = true
logger.Infow(ctx, "group frequency updated", log.Fields{"device-id": mm.pDeviceHandler.deviceID, "newGroupFreq": newGroupFreq, "groupName": aGroupName})
+ break
}
}
if !updated {
@@ -415,6 +418,7 @@
v.nextCollectionInterval = time.Now().Add(time.Duration(newMetricFreq) * time.Second)
updated = true
logger.Infow(ctx, "metric frequency updated", log.Fields{"device-id": mm.pDeviceHandler.deviceID, "newMetricFreq": newMetricFreq, "aMetricName": aMetricName})
+ break
}
}
if !updated {
@@ -456,7 +460,7 @@
// unlike other L2 PM counters. We need to fetch the active gemport instance IDs in the system to
// take further action
if v.groupName == GemPortHistoryName {
- mm.updateGemPortNTPInstanceToAddForPerfMonitoring()
+ mm.updateGemPortNTPInstanceToAddForPerfMonitoring(ctx)
}
} else if mm.pDeviceHandler.pmConfigs.FreqOverride { // otherwise just update the next collection interval
v.nextCollectionInterval = time.Now().Add(time.Duration(v.frequency) * time.Second)
@@ -472,7 +476,7 @@
// unlike other L2 PM counters. We need to fetch the active gemport instance IDs in the system to
// take further action
if v.groupName == GemPortHistoryName {
- mm.updateGemPortNTPInstanceToDeleteForPerfMonitoring()
+ mm.updateGemPortNTPInstanceToDeleteForPerfMonitoring(ctx)
}
}
}
@@ -522,6 +526,7 @@
}
updated = true
logger.Infow(ctx, "standalone metric support updated", log.Fields{"device-id": mm.pDeviceHandler.deviceID, "metricName": aMetricName, "enabled": metric.Enabled})
+ break
}
}
if !updated {
@@ -631,7 +636,7 @@
case meAttributes = <-mm.opticalMetricsChan:
logger.Debugw(ctx, "received optical metrics", log.Fields{"device-id": mm.pDeviceHandler.deviceID})
case <-time.After(time.Duration(ConstDefaultOmciTimeout) * time.Second):
- logger.Errorw(ctx, "timeout waiting for omci-get response for uni status", log.Fields{"device-id": mm.pDeviceHandler.deviceID})
+ logger.Errorw(ctx, "timeout waiting for omci-get response for optical metrics", log.Fields{"device-id": mm.pDeviceHandler.deviceID})
// The metrics will be empty in this case
break loop
}
@@ -1286,6 +1291,7 @@
if mm.groupMetricMap[n].createRetryAttempts > L2PmCreateAttempts {
logger.Debugw(ctx, "exceeded-max-add-retry-attempts--disabling-group", log.Fields{"groupName": n})
mm.groupMetricMap[n].enabled = false
+ mm.groupMetricMap[n].createRetryAttempts = 0 // reset counter
mm.l2PmToAdd = mm.removeIfFoundString(mm.l2PmToAdd, n)
}
@@ -1388,8 +1394,19 @@
mm.onuMetricsManagerLock.Unlock()
} else {
logger.Warnw(ctx, "state delete pm - failed to delete pm",
- log.Fields{"device-id": mm.pDeviceHandler.deviceID, "metricName": n})
- // Nothing we can do about this.
+ log.Fields{"device-id": mm.pDeviceHandler.deviceID, "metricName": n,
+ "deleteRetryAttempts": mm.groupMetricMap[n].deleteRetryAttempts,
+ "active-l2-pms": mm.activeL2Pms, "pms-to-delete": mm.l2PmToDelete})
+ // If deleteRetryAttempts exceeds L2PmDeleteAttempts then just give up
+ mm.onuMetricsManagerLock.Lock()
+ mm.groupMetricMap[n].deleteRetryAttempts++
+ if mm.groupMetricMap[n].deleteRetryAttempts > L2PmDeleteAttempts {
+ logger.Debugw(ctx, "exceeded-max-delete-retry-attempts--disabling-group", log.Fields{"groupName": n})
+ mm.activeL2Pms = mm.removeIfFoundString(mm.activeL2Pms, n)
+ mm.l2PmToDelete = mm.removeIfFoundString(mm.l2PmToDelete, n)
+ mm.groupMetricMap[n].deleteRetryAttempts = 0 // reset counter
+ }
+ mm.onuMetricsManagerLock.Unlock()
}
}
logger.Debugw(ctx, "state delete pm - done", log.Fields{"device-id": mm.pDeviceHandler.deviceID, "active-l2-pms": mm.activeL2Pms, "pms-to-delete": mm.l2PmToDelete})
@@ -2052,11 +2069,11 @@
logger.Debugw(ctx, "waitForResponseOrTimeout", log.Fields{"create": create, "instID": instID, "meClassName": meClassName})
select {
case resp := <-mm.l2PmCreateOrDeleteResponseChan:
- logger.Debugw(ctx, "received create l2 pm me response",
+ logger.Debugw(ctx, "received l2 pm me response",
log.Fields{"device-id": mm.pDeviceHandler.deviceID, "resp": resp, "create": create, "meClassName": meClassName, "instID": instID})
return resp
case <-time.After(time.Duration(ConstDefaultOmciTimeout) * time.Second):
- logger.Errorw(ctx, "timeout waiting for create l2 pm me response",
+ logger.Errorw(ctx, "timeout waiting for l2 pm me response",
log.Fields{"device-id": mm.pDeviceHandler.deviceID, "resp": false, "create": create, "meClassName": meClassName, "instID": instID})
}
return false
@@ -2222,31 +2239,33 @@
// gemPortNCTPPerfHistInstToAdd slice
}
-func (mm *onuMetricsManager) updateGemPortNTPInstanceToAddForPerfMonitoring() {
+func (mm *onuMetricsManager) updateGemPortNTPInstanceToAddForPerfMonitoring(ctx context.Context) {
if mm.pDeviceHandler.pOnuTP != nil {
gemPortInstIDs := mm.pDeviceHandler.pOnuTP.GetAllBidirectionalGemPortIDsForOnu()
- mm.onuMetricsManagerLock.Lock()
- defer mm.onuMetricsManagerLock.Unlock()
+ // NOTE: It is expected that caller of this function has acquired the required mutex for synchronization purposes
for _, v := range gemPortInstIDs {
// mark the instance for addition
mm.gemPortNCTPPerfHistInstToAdd = mm.appendIfMissingUnt16(mm.gemPortNCTPPerfHistInstToAdd, v)
// If the instance presence toggles too soon, we need to remove it from gemPortNCTPPerfHistInstToDelete slice
mm.gemPortNCTPPerfHistInstToDelete = mm.removeIfFoundUint16(mm.gemPortNCTPPerfHistInstToDelete, v)
}
+ logger.Debugw(ctx, "updateGemPortNTPInstanceToAddForPerfMonitoring",
+ log.Fields{"deviceID": mm.pDeviceHandler.deviceID, "gemToAdd": mm.gemPortNCTPPerfHistInstToAdd, "gemToDel": mm.gemPortNCTPPerfHistInstToDelete})
}
}
-func (mm *onuMetricsManager) updateGemPortNTPInstanceToDeleteForPerfMonitoring() {
+func (mm *onuMetricsManager) updateGemPortNTPInstanceToDeleteForPerfMonitoring(ctx context.Context) {
if mm.pDeviceHandler.pOnuTP != nil {
gemPortInstIDs := mm.pDeviceHandler.pOnuTP.GetAllBidirectionalGemPortIDsForOnu()
- mm.onuMetricsManagerLock.Lock()
- defer mm.onuMetricsManagerLock.Unlock()
+ // NOTE: It is expected that caller of this function has acquired the required mutex for synchronization purposes
for _, v := range gemPortInstIDs {
mm.gemPortNCTPPerfHistInstToDelete = mm.appendIfMissingUnt16(mm.gemPortNCTPPerfHistInstToDelete, v)
// If the instance presence toggles too soon, we need to remove it from gemPortNCTPPerfHistInstToAdd slice
mm.gemPortNCTPPerfHistInstToAdd = mm.removeIfFoundUint16(mm.gemPortNCTPPerfHistInstToAdd, v)
}
}
+ logger.Debugw(ctx, "updateGemPortNTPInstanceToDeleteForPerfMonitoring",
+ log.Fields{"deviceID": mm.pDeviceHandler.deviceID, "gemToAdd": mm.gemPortNCTPPerfHistInstToAdd, "gemToDel": mm.gemPortNCTPPerfHistInstToDelete})
}
func (mm *onuMetricsManager) appendIfMissingString(slice []string, n string) []string {