[VOL-4774] openonuAdapterGo: Panic during scale test
Change-Id: I983eeed5d63b5a778bfc22485d319a55acdbab46
diff --git a/internal/pkg/common/defines.go b/internal/pkg/common/defines.go
index c2eecbd..9cb4ba8 100755
--- a/internal/pkg/common/defines.go
+++ b/internal/pkg/common/defines.go
@@ -361,6 +361,12 @@
OnuOmciCommunicationFailureConfig = "ONU_OMCI_COMMUNICATION_FAILURE_CONFIG"
OnuOmciCommunicationFailureConfigDesc = "OMCI communication during ONU configuration failed"
+ OnuOmciCommunicationAbortConfig = "ONU_OMCI_COMMUNICATION_ABORT_CONFIG"
+ OnuOmciCommunicationAbortConfigDesc = "OMCI communication during ONU configuration aborted - max failures reached: stopping device"
+
OnuOmciCommunicationFailureSwUpgrade = "ONU_OMCI_COMMUNICATION_FAILURE_SW_UPGRADE"
OnuOmciCommunicationFailureSwUpgradeDesc = "OMCI communication during ONU SW upgrade failed"
+
+ OnuOmciCommunicationAbortSwUpgrade = "ONU_OMCI_COMMUNICATION_ABORT_SW_UPGRADE"
+ OnuOmciCommunicationAbortSwUpgradeDesc = "OMCI communication during ONU SW upgrade aborted - max failures reached: stopping device"
)
diff --git a/internal/pkg/common/interfaces.go b/internal/pkg/common/interfaces.go
index 98f644c..69fa26f 100755
--- a/internal/pkg/common/interfaces.go
+++ b/internal/pkg/common/interfaces.go
@@ -124,6 +124,8 @@
CreatePortInCore(context.Context, *voltha.Port) error
PerOnuFlowHandlerRoutine(uniID uint8)
+
+ UpdateInterface(context.Context) error
}
// IonuDeviceEntry interface to onuDeviceEntry
diff --git a/internal/pkg/common/omci_cc.go b/internal/pkg/common/omci_cc.go
index 45ee429..08ee189 100755
--- a/internal/pkg/common/omci_cc.go
+++ b/internal/pkg/common/omci_cc.go
@@ -72,6 +72,8 @@
// CDefaultRetries - TODO: add comment
const CDefaultRetries = 2
+const cMaxConsecutiveOmciTimeouts = 3
+
// ### OMCI related definitions - end
//CallbackPairEntry to be used for OMCI send/receive correlation
@@ -121,15 +123,19 @@
UploadSequNo uint16
UploadNoOfCmds uint16
- mutexSendQueuedRequests sync.Mutex
- mutexLowPrioTxQueue sync.Mutex
- lowPrioTxQueue *list.List
- mutexHighPrioTxQueue sync.Mutex
- highPrioTxQueue *list.List
- mutexRxSchedMap sync.Mutex
- rxSchedulerMap map[uint16]CallbackPairEntry
- mutexMonReq sync.RWMutex
- monitoredRequests map[uint16]OmciTransferStructure
+ mutexSendQueuedRequests sync.Mutex
+ mutexLowPrioTxQueue sync.Mutex
+ lowPrioTxQueue *list.List
+ mutexHighPrioTxQueue sync.Mutex
+ highPrioTxQueue *list.List
+ mutexRxSchedMap sync.Mutex
+ rxSchedulerMap map[uint16]CallbackPairEntry
+ mutexMonReq sync.RWMutex
+ monitoredRequests map[uint16]OmciTransferStructure
+ mutexConsecutiveOmciTimeouts sync.RWMutex
+ consecutiveOmciTimeouts uint8
+ mutexOmciAbortInProgress sync.RWMutex
+ omciAbortInProgress bool
}
var responsesWithMibDataSync = []omci.MessageType{
@@ -170,7 +176,8 @@
omciCC.highPrioTxQueue = list.New()
omciCC.rxSchedulerMap = make(map[uint16]CallbackPairEntry)
omciCC.monitoredRequests = make(map[uint16]OmciTransferStructure)
-
+ omciCC.consecutiveOmciTimeouts = 0
+ omciCC.omciAbortInProgress = false
return &omciCC
}
@@ -400,6 +407,12 @@
oo.mutexRxSchedMap.Lock()
rxCallbackEntry, ok := oo.rxSchedulerMap[omciMsg.TransactionID]
if ok && rxCallbackEntry.CbFunction != nil {
+
+ // response matched a registered callback for its transaction ID - reset counter of consecutive OMCI timeouts
+ oo.mutexConsecutiveOmciTimeouts.Lock()
+ oo.consecutiveOmciTimeouts = 0
+ oo.mutexConsecutiveOmciTimeouts.Unlock()
+
if rxCallbackEntry.FramePrint {
oo.printRxMessage(ctx, rxMsg)
}
@@ -4392,6 +4405,31 @@
logger.Errorw(ctx, "reqMon: timeout waiting for response - no of max retries reached - send ONU device event!",
log.Fields{"tid": tid, "retries": retryCounter, "device-id": oo.deviceID})
oo.pOnuDeviceEntry.SendOnuDeviceEvent(ctx, OnuOmciCommunicationFailureSwUpgrade, OnuOmciCommunicationFailureSwUpgradeDesc)
+ oo.mutexConsecutiveOmciTimeouts.Lock()
+ if oo.consecutiveOmciTimeouts < cMaxConsecutiveOmciTimeouts {
+ oo.consecutiveOmciTimeouts++
+ oo.mutexConsecutiveOmciTimeouts.Unlock()
+ } else {
+ oo.consecutiveOmciTimeouts = 0
+ oo.mutexConsecutiveOmciTimeouts.Unlock()
+ oo.mutexOmciAbortInProgress.Lock()
+ if !oo.omciAbortInProgress {
+ oo.omciAbortInProgress = true
+ oo.mutexOmciAbortInProgress.Unlock()
+ logger.Errorw(ctx, "reqMon: communication aborted - no of max consecutive timeouts reached - stopping device and send ONU device event!",
+ log.Fields{"tid": tid, "device-id": oo.deviceID})
+ oo.pOnuDeviceEntry.SendOnuDeviceEvent(ctx, OnuOmciCommunicationAbortSwUpgrade, OnuOmciCommunicationAbortSwUpgradeDesc)
+ // stop all running FSM processing
+ _ = oo.pBaseDeviceHandler.UpdateInterface(ctx)
+ oo.mutexOmciAbortInProgress.Lock()
+ oo.omciAbortInProgress = false
+ oo.mutexOmciAbortInProgress.Unlock()
+ } else {
+ oo.mutexOmciAbortInProgress.Unlock()
+ logger.Infow(ctx, "reqMon: communication aborted - corresponding processing already running",
+ log.Fields{"tid": tid, "device-id": oo.deviceID})
+ }
+ }
break loop
} else {
logger.Infow(ctx, "reqMon: timeout waiting for response - retry",
@@ -4879,6 +4917,31 @@
logger.Errorw(ctx, "reqMon: timeout waiting for response - no of max retries reached - send ONU device event!",
log.Fields{"tid": tid, "retries": retryCounter, "device-id": oo.deviceID})
oo.pOnuDeviceEntry.SendOnuDeviceEvent(ctx, OnuOmciCommunicationFailureConfig, OnuOmciCommunicationFailureConfigDesc)
+ oo.mutexConsecutiveOmciTimeouts.Lock()
+ if oo.consecutiveOmciTimeouts < cMaxConsecutiveOmciTimeouts {
+ oo.consecutiveOmciTimeouts++
+ oo.mutexConsecutiveOmciTimeouts.Unlock()
+ } else {
+ oo.consecutiveOmciTimeouts = 0
+ oo.mutexConsecutiveOmciTimeouts.Unlock()
+ oo.mutexOmciAbortInProgress.Lock()
+ if !oo.omciAbortInProgress {
+ oo.omciAbortInProgress = true
+ oo.mutexOmciAbortInProgress.Unlock()
+ logger.Errorw(ctx, "reqMon: communication aborted - no of max consecutive timeouts reached - stopping device and send ONU device event!",
+ log.Fields{"tid": tid, "device-id": oo.deviceID})
+ oo.pOnuDeviceEntry.SendOnuDeviceEvent(ctx, OnuOmciCommunicationAbortConfig, OnuOmciCommunicationAbortConfigDesc)
+ // stop all running FSM processing
+ _ = oo.pBaseDeviceHandler.UpdateInterface(ctx)
+ oo.mutexOmciAbortInProgress.Lock()
+ oo.omciAbortInProgress = false
+ oo.mutexOmciAbortInProgress.Unlock()
+ } else {
+ oo.mutexOmciAbortInProgress.Unlock()
+ logger.Infow(ctx, "reqMon: communication aborted - corresponding processing already running",
+ log.Fields{"tid": tid, "device-id": oo.deviceID})
+ }
+ }
break loop
} else {
logger.Infow(ctx, "reqMon: timeout waiting for response - retry",
diff --git a/internal/pkg/core/device_handler.go b/internal/pkg/core/device_handler.go
index af7fef3..53db879 100755
--- a/internal/pkg/core/device_handler.go
+++ b/internal/pkg/core/device_handler.go
@@ -2131,7 +2131,7 @@
return nil
}
-func (dh *deviceHandler) updateInterface(ctx context.Context, onuind *oop.OnuIndication) error {
+func (dh *deviceHandler) UpdateInterface(ctx context.Context) error {
//state checking to prevent unneeded processing (eg. on ONU 'unreachable' and 'down')
// (but note that the deviceReason may also have changed to e.g. TechProf*Delete_Success in between)
if dh.getDeviceReason() != cmn.DrStoppingOpenomci {
diff --git a/internal/pkg/core/openonu.go b/internal/pkg/core/openonu.go
index 96c13aa..d611a87 100755
--- a/internal/pkg/core/openonu.go
+++ b/internal/pkg/core/openonu.go
@@ -898,7 +898,7 @@
}
return &empty.Empty{}, nil
} else if (onuOperstate == "down") || (onuOperstate == "unreachable") {
- if err := handler.updateInterface(ctx, onuIndication); err != nil {
+ if err := handler.UpdateInterface(ctx); err != nil {
return nil, err
}
return &empty.Empty{}, nil