[VOL-4299] R2.8 - OpenOnuAdapter ATT scenario Flow suspension on cookie usage runs into deadlock
Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: I6348ea5ef51c93bc0a502fd2f81f07737258ba64
diff --git a/VERSION b/VERSION
index 0c00f61..17e63e7 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.3.10
+1.3.11
diff --git a/internal/pkg/onuadaptercore/device_handler.go b/internal/pkg/onuadaptercore/device_handler.go
index edb44ce..4fb0a66 100644
--- a/internal/pkg/onuadaptercore/device_handler.go
+++ b/internal/pkg/onuadaptercore/device_handler.go
@@ -221,6 +221,7 @@
uniEntityMap map[uint32]*onuUniPort
mutexKvStoreContext sync.Mutex
lockVlanConfig sync.RWMutex
+ lockVlanAdd sync.RWMutex
UniVlanConfigFsmMap map[uint8]*UniVlanConfigFsm
lockUpgradeFsm sync.RWMutex
pOnuUpradeFsm *OnuUpgradeFsm
@@ -261,6 +262,7 @@
//TODO initialize the support classes.
dh.uniEntityMap = make(map[uint32]*onuUniPort)
dh.lockVlanConfig = sync.RWMutex{}
+ dh.lockVlanAdd = sync.RWMutex{}
dh.lockUpgradeFsm = sync.RWMutex{}
dh.UniVlanConfigFsmMap = make(map[uint8]*UniVlanConfigFsm)
dh.reconciling = cNoReconciling
@@ -3026,25 +3028,32 @@
logger.Debugw(ctx, "flow-add vlan-set", log.Fields{"device-id": dh.deviceID})
}
- //mutex protection as the update_flow rpc maybe running concurrently for different flows, perhaps also activities
- // must be set including the execution of createVlanFilterFsm() to avoid unintended creation of FSM's
- // when different rules are requested concurrently for the same uni
- // (also vlan persistency data does not support multiple FSM's on the same UNI correctly!)
- dh.lockVlanConfig.Lock()
- logger.Debugw(ctx, "flow-add got lock", log.Fields{"device-id": dh.deviceID, "tpID": loTpID, "uniID": apUniPort.uniID})
var meter *voltha.OfpMeterConfig
if apFlowMetaData != nil {
meter = apFlowMetaData.Meters[0]
}
+ //mutex protection as the update_flow rpc maybe running concurrently for different flows, perhaps also activities
+ // must be set including the execution of createVlanFilterFsm() to avoid unintended creation of FSM's
+ // when different rules are requested concurrently for the same uni
+ // (also vlan persistency data does not support multiple FSM's on the same UNI correctly!)
+ dh.lockVlanAdd.Lock() //prevent multiple add activities to start in parallel
+ dh.lockVlanConfig.RLock() //read protection on UniVlanConfigFsmMap (removeFlowItemFromUniPort)
+ logger.Debugw(ctx, "flow-add got lock", log.Fields{"device-id": dh.deviceID, "tpID": loTpID, "uniID": apUniPort.uniID})
if _, exist := dh.UniVlanConfigFsmMap[apUniPort.uniID]; exist {
+ //SetUniFlowParams() may block on some rule that is suspended-to-add
+ // in order to allow for according flow removal lockVlanConfig may only be used with RLock here
err := dh.UniVlanConfigFsmMap[apUniPort.uniID].SetUniFlowParams(ctx, loTpID, loCookieSlice,
loMatchVlan, loSetVlan, loSetPcp, false, meter)
- dh.lockVlanConfig.Unlock()
+ dh.lockVlanConfig.RUnlock()
+ dh.lockVlanAdd.Unlock() //re-admit new Add-flow-processing
return err
}
+ dh.lockVlanConfig.RUnlock()
+ dh.lockVlanConfig.Lock() //createVlanFilterFsm should always be a non-blocking operation and requires r+w lock
err := dh.createVlanFilterFsm(ctx, apUniPort, loTpID, loCookieSlice,
loMatchVlan, loSetVlan, loSetPcp, OmciVlanFilterAddDone, false, meter)
dh.lockVlanConfig.Unlock()
+ dh.lockVlanAdd.Unlock() //re-admit new Add-flow-processing
return err
}
@@ -3080,6 +3089,7 @@
//mutex protection as the update_flow rpc maybe running concurrently for different flows, perhaps also activities
dh.lockVlanConfig.RLock()
defer dh.lockVlanConfig.RUnlock()
+ logger.Debugw(ctx, "flow-remove got RLock", log.Fields{"device-id": dh.deviceID, "uniID": apUniPort.uniID})
if _, exist := dh.UniVlanConfigFsmMap[apUniPort.uniID]; exist {
return dh.UniVlanConfigFsmMap[apUniPort.uniID].RemoveUniFlowParams(ctx, loCookie)
}
diff --git a/internal/pkg/onuadaptercore/omci_vlan_config.go b/internal/pkg/onuadaptercore/omci_vlan_config.go
index 26ecac0..3201150 100644
--- a/internal/pkg/onuadaptercore/omci_vlan_config.go
+++ b/internal/pkg/onuadaptercore/omci_vlan_config.go
@@ -496,7 +496,11 @@
//a delay for adding the cookie to this rule is requested
// take care of the mutex which is already locked here, need to unlock/lock accordingly to prevent deadlock in suspension
oFsm.mutexFlowParams.Unlock()
- oFsm.suspendNewRule(ctx)
+ if deleteSuccess := oFsm.suspendNewRule(ctx); !deleteSuccess {
+ logger.Errorw(ctx, "UniVlanConfigFsm suspended add-cookie-to-rule aborted", log.Fields{
+ "device-id": oFsm.deviceID, "cookie": delayedCookie})
+ return fmt.Errorf(" UniVlanConfigFsm suspended add-cookie-to-rule aborted %s", oFsm.deviceID)
+ }
flowCookieModify, requestAppendRule = oFsm.reviseFlowConstellation(ctx, delayedCookie, loRuleParams)
oFsm.mutexFlowParams.Lock()
} else {
@@ -515,7 +519,12 @@
oFsm.mutexFlowParams.Unlock()
if !flowEntryMatch { //it is (was) a new rule
- delayedCookie := oFsm.suspendIfRequiredNewRule(ctx, aCookieSlice)
+ delayedCookie, deleteSuccess := oFsm.suspendIfRequiredNewRule(ctx, aCookieSlice)
+ if !deleteSuccess {
+ logger.Errorw(ctx, "UniVlanConfigFsm suspended add-new-rule aborted", log.Fields{
+ "device-id": oFsm.deviceID, "cookie": delayedCookie})
+ return fmt.Errorf(" UniVlanConfigFsm suspended add-new-rule aborted %s", oFsm.deviceID)
+ }
requestAppendRule = true //default assumption here is that rule is to be appended
flowCookieModify = true //and that the the flow data base is to be updated
if delayedCookie != 0 { //it was suspended
@@ -713,15 +722,16 @@
}
return 0 //no delay requested
}
-func (oFsm *UniVlanConfigFsm) suspendNewRule(ctx context.Context) {
+func (oFsm *UniVlanConfigFsm) suspendNewRule(ctx context.Context) bool {
oFsm.mutexFlowParams.RLock()
logger.Infow(ctx, "Need to suspend adding this rule as long as the cookie is still connected to some other rule", log.Fields{
"device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie})
oFsm.mutexFlowParams.RUnlock()
+ cookieDeleted := true //default assumption also for timeout (just try to continue as if removed)
select {
- case <-oFsm.chCookieDeleted:
- logger.Infow(ctx, "resume adding this rule after having deleted cookie in some other rule", log.Fields{
- "device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie})
+ case cookieDeleted = <-oFsm.chCookieDeleted:
+ logger.Infow(ctx, "resume adding this rule after having deleted cookie in some other rule or abort", log.Fields{
+ "device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie, "deleted": cookieDeleted})
case <-time.After(oFsm.pOmciCC.GetMaxOmciTimeoutWithRetries() * time.Second):
logger.Errorw(ctx, "timeout waiting for deletion of cookie in some other rule, just try to continue", log.Fields{
"device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie})
@@ -729,16 +739,18 @@
oFsm.mutexFlowParams.Lock()
oFsm.delayNewRuleCookie = 0
oFsm.mutexFlowParams.Unlock()
+ return cookieDeleted
}
-func (oFsm *UniVlanConfigFsm) suspendIfRequiredNewRule(ctx context.Context, aCookieSlice []uint64) uint64 {
+func (oFsm *UniVlanConfigFsm) suspendIfRequiredNewRule(ctx context.Context, aCookieSlice []uint64) (uint64, bool) {
oFsm.mutexFlowParams.Lock()
delayedCookie := oFsm.delayNewRuleForCookie(ctx, aCookieSlice)
oFsm.mutexFlowParams.Unlock()
+ deleteSuccess := true
if delayedCookie != 0 {
- oFsm.suspendNewRule(ctx)
+ deleteSuccess = oFsm.suspendNewRule(ctx)
}
- return delayedCookie
+ return delayedCookie, deleteSuccess
}
//returns flowModified, RuleAppendRequest
@@ -1793,7 +1805,7 @@
if oFsm.delayNewRuleCookie != 0 {
// looks like the waiting AddFlow is stuck
oFsm.mutexFlowParams.RUnlock()
- oFsm.chCookieDeleted <- true // let the waiting AddFlow thread continue/terminate
+ oFsm.chCookieDeleted <- false // let the waiting AddFlow thread terminate
oFsm.mutexFlowParams.RLock()
}
if len(oFsm.uniRemoveFlowsSlice) > 0 {