[VOL-4299] R2.8 - OpenOnuAdapter ATT scenario Flow suspension on cookie usage runs into deadlock

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: I6348ea5ef51c93bc0a502fd2f81f07737258ba64
diff --git a/VERSION b/VERSION
index 0c00f61..17e63e7 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.3.10
+1.3.11
diff --git a/internal/pkg/onuadaptercore/device_handler.go b/internal/pkg/onuadaptercore/device_handler.go
index edb44ce..4fb0a66 100644
--- a/internal/pkg/onuadaptercore/device_handler.go
+++ b/internal/pkg/onuadaptercore/device_handler.go
@@ -221,6 +221,7 @@
 	uniEntityMap                map[uint32]*onuUniPort
 	mutexKvStoreContext         sync.Mutex
 	lockVlanConfig              sync.RWMutex
+	lockVlanAdd                 sync.RWMutex
 	UniVlanConfigFsmMap         map[uint8]*UniVlanConfigFsm
 	lockUpgradeFsm              sync.RWMutex
 	pOnuUpradeFsm               *OnuUpgradeFsm
@@ -261,6 +262,7 @@
 	//TODO initialize the support classes.
 	dh.uniEntityMap = make(map[uint32]*onuUniPort)
 	dh.lockVlanConfig = sync.RWMutex{}
+	dh.lockVlanAdd = sync.RWMutex{}
 	dh.lockUpgradeFsm = sync.RWMutex{}
 	dh.UniVlanConfigFsmMap = make(map[uint8]*UniVlanConfigFsm)
 	dh.reconciling = cNoReconciling
@@ -3026,25 +3028,32 @@
 		logger.Debugw(ctx, "flow-add vlan-set", log.Fields{"device-id": dh.deviceID})
 	}
 
-	//mutex protection as the update_flow rpc maybe running concurrently for different flows, perhaps also activities
-	//  must be set including the execution of createVlanFilterFsm() to avoid unintended creation of FSM's
-	//  when different rules are requested concurrently for the same uni
-	//  (also vlan persistency data does not support multiple FSM's on the same UNI correctly!)
-	dh.lockVlanConfig.Lock()
-	logger.Debugw(ctx, "flow-add got lock", log.Fields{"device-id": dh.deviceID, "tpID": loTpID, "uniID": apUniPort.uniID})
 	var meter *voltha.OfpMeterConfig
 	if apFlowMetaData != nil {
 		meter = apFlowMetaData.Meters[0]
 	}
+	//mutex protection as the update_flow rpc maybe running concurrently for different flows, perhaps also activities
+	//  must be set including the execution of createVlanFilterFsm() to avoid unintended creation of FSM's
+	//  when different rules are requested concurrently for the same uni
+	//  (also vlan persistency data does not support multiple FSM's on the same UNI correctly!)
+	dh.lockVlanAdd.Lock()     //prevent multiple add activities to start in parallel
+	dh.lockVlanConfig.RLock() //read protection on UniVlanConfigFsmMap (removeFlowItemFromUniPort)
+	logger.Debugw(ctx, "flow-add got lock", log.Fields{"device-id": dh.deviceID, "tpID": loTpID, "uniID": apUniPort.uniID})
 	if _, exist := dh.UniVlanConfigFsmMap[apUniPort.uniID]; exist {
+		//SetUniFlowParams() may block on some rule that is suspended-to-add
+		//  in order to allow for according flow removal lockVlanConfig may only be used with RLock here
 		err := dh.UniVlanConfigFsmMap[apUniPort.uniID].SetUniFlowParams(ctx, loTpID, loCookieSlice,
 			loMatchVlan, loSetVlan, loSetPcp, false, meter)
-		dh.lockVlanConfig.Unlock()
+		dh.lockVlanConfig.RUnlock()
+		dh.lockVlanAdd.Unlock() //re-admit new Add-flow-processing
 		return err
 	}
+	dh.lockVlanConfig.RUnlock()
+	dh.lockVlanConfig.Lock() //createVlanFilterFsm should always be a non-blocking operation and requires r+w lock
 	err := dh.createVlanFilterFsm(ctx, apUniPort, loTpID, loCookieSlice,
 		loMatchVlan, loSetVlan, loSetPcp, OmciVlanFilterAddDone, false, meter)
 	dh.lockVlanConfig.Unlock()
+	dh.lockVlanAdd.Unlock() //re-admit new Add-flow-processing
 	return err
 }
 
@@ -3080,6 +3089,7 @@
 	//mutex protection as the update_flow rpc maybe running concurrently for different flows, perhaps also activities
 	dh.lockVlanConfig.RLock()
 	defer dh.lockVlanConfig.RUnlock()
+	logger.Debugw(ctx, "flow-remove got RLock", log.Fields{"device-id": dh.deviceID, "uniID": apUniPort.uniID})
 	if _, exist := dh.UniVlanConfigFsmMap[apUniPort.uniID]; exist {
 		return dh.UniVlanConfigFsmMap[apUniPort.uniID].RemoveUniFlowParams(ctx, loCookie)
 	}
diff --git a/internal/pkg/onuadaptercore/omci_vlan_config.go b/internal/pkg/onuadaptercore/omci_vlan_config.go
index 26ecac0..3201150 100644
--- a/internal/pkg/onuadaptercore/omci_vlan_config.go
+++ b/internal/pkg/onuadaptercore/omci_vlan_config.go
@@ -496,7 +496,11 @@
 						//a delay for adding the cookie to this rule is requested
 						// take care of the mutex which is already locked here, need to unlock/lock accordingly to prevent deadlock in suspension
 						oFsm.mutexFlowParams.Unlock()
-						oFsm.suspendNewRule(ctx)
+						if deleteSuccess := oFsm.suspendNewRule(ctx); !deleteSuccess {
+							logger.Errorw(ctx, "UniVlanConfigFsm suspended add-cookie-to-rule aborted", log.Fields{
+								"device-id": oFsm.deviceID, "cookie": delayedCookie})
+							return fmt.Errorf(" UniVlanConfigFsm suspended add-cookie-to-rule aborted %s", oFsm.deviceID)
+						}
 						flowCookieModify, requestAppendRule = oFsm.reviseFlowConstellation(ctx, delayedCookie, loRuleParams)
 						oFsm.mutexFlowParams.Lock()
 					} else {
@@ -515,7 +519,12 @@
 	oFsm.mutexFlowParams.Unlock()
 
 	if !flowEntryMatch { //it is (was) a new rule
-		delayedCookie := oFsm.suspendIfRequiredNewRule(ctx, aCookieSlice)
+		delayedCookie, deleteSuccess := oFsm.suspendIfRequiredNewRule(ctx, aCookieSlice)
+		if !deleteSuccess {
+			logger.Errorw(ctx, "UniVlanConfigFsm suspended add-new-rule aborted", log.Fields{
+				"device-id": oFsm.deviceID, "cookie": delayedCookie})
+			return fmt.Errorf(" UniVlanConfigFsm suspended add-new-rule aborted %s", oFsm.deviceID)
+		}
 		requestAppendRule = true //default assumption here is that rule is to be appended
 		flowCookieModify = true  //and that the the flow data base is to be updated
 		if delayedCookie != 0 {  //it was suspended
@@ -713,15 +722,16 @@
 	}
 	return 0 //no delay requested
 }
-func (oFsm *UniVlanConfigFsm) suspendNewRule(ctx context.Context) {
+func (oFsm *UniVlanConfigFsm) suspendNewRule(ctx context.Context) bool {
 	oFsm.mutexFlowParams.RLock()
 	logger.Infow(ctx, "Need to suspend adding this rule as long as the cookie is still connected to some other rule", log.Fields{
 		"device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie})
 	oFsm.mutexFlowParams.RUnlock()
+	cookieDeleted := true //default assumption also for timeout (just try to continue as if removed)
 	select {
-	case <-oFsm.chCookieDeleted:
-		logger.Infow(ctx, "resume adding this rule after having deleted cookie in some other rule", log.Fields{
-			"device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie})
+	case cookieDeleted = <-oFsm.chCookieDeleted:
+		logger.Infow(ctx, "resume adding this rule after having deleted cookie in some other rule or abort", log.Fields{
+			"device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie, "deleted": cookieDeleted})
 	case <-time.After(oFsm.pOmciCC.GetMaxOmciTimeoutWithRetries() * time.Second):
 		logger.Errorw(ctx, "timeout waiting for deletion of cookie in some other rule, just try to continue", log.Fields{
 			"device-id": oFsm.deviceID, "cookie": oFsm.delayNewRuleCookie})
@@ -729,16 +739,18 @@
 	oFsm.mutexFlowParams.Lock()
 	oFsm.delayNewRuleCookie = 0
 	oFsm.mutexFlowParams.Unlock()
+	return cookieDeleted
 }
-func (oFsm *UniVlanConfigFsm) suspendIfRequiredNewRule(ctx context.Context, aCookieSlice []uint64) uint64 {
+func (oFsm *UniVlanConfigFsm) suspendIfRequiredNewRule(ctx context.Context, aCookieSlice []uint64) (uint64, bool) {
 	oFsm.mutexFlowParams.Lock()
 	delayedCookie := oFsm.delayNewRuleForCookie(ctx, aCookieSlice)
 	oFsm.mutexFlowParams.Unlock()
 
+	deleteSuccess := true
 	if delayedCookie != 0 {
-		oFsm.suspendNewRule(ctx)
+		deleteSuccess = oFsm.suspendNewRule(ctx)
 	}
-	return delayedCookie
+	return delayedCookie, deleteSuccess
 }
 
 //returns flowModified, RuleAppendRequest
@@ -1793,7 +1805,7 @@
 	if oFsm.delayNewRuleCookie != 0 {
 		// looks like the waiting AddFlow is stuck
 		oFsm.mutexFlowParams.RUnlock()
-		oFsm.chCookieDeleted <- true // let the waiting AddFlow thread continue/terminate
+		oFsm.chCookieDeleted <- false // let the waiting AddFlow thread terminate
 		oFsm.mutexFlowParams.RLock()
 	}
 	if len(oFsm.uniRemoveFlowsSlice) > 0 {