[VOL-4253] R2.8 OpenOnuAdapter Techprofile configuration/removal deadlock when conflicting with ONU down handling

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: Id47109294d0d8df210ce97b3d441642dda98e2c6
diff --git a/VERSION b/VERSION
index 80e78df..95b25ae 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.3.5
+1.3.6
diff --git a/internal/pkg/onuadaptercore/omci_ani_config.go b/internal/pkg/onuadaptercore/omci_ani_config.go
index 340dd1b..4473c7e 100644
--- a/internal/pkg/onuadaptercore/omci_ani_config.go
+++ b/internal/pkg/onuadaptercore/omci_ani_config.go
@@ -286,12 +286,11 @@
 		}(pAdaptFsm)
 	}
 
-	//wait for completion of possibly ongoing techprofile config/remove requests to avoid
-	// access conflicts on internal data by next needed data clearance
-	//activity should be aborted in short time if running with FSM due to above FSM reset
-	//  or finished without FSM dependency in short time
-	oFsm.pUniTechProf.lockTpProcMutex()
-	defer oFsm.pUniTechProf.unlockTpProcMutex()
+	// Possible access conflicts on internal data during the data clearance below
+	//   are avoided by also using mutexTPState from within clearAniSideConfig.
+	//   Do not try to lock TpProcMutex here, as was done in the previous code version:
+	//   it may result in deadlock situations (as observed at soft-reboot handling, where
+	//   TpProcMutex is already locked by some ongoing TechProfile config/removal processing).
 	//remove all TechProf related internal data to allow for new configuration
 	oFsm.pUniTechProf.clearAniSideConfig(ctx, oFsm.pOnuUniPort.uniID, oFsm.techProfileID)
 }
@@ -324,11 +323,6 @@
 		tcontInstID, tcontAlreadyExist, err := pDevEntry.allocateFreeTcont(ctx, oFsm.pUniTechProf.mapPonAniConfig[oFsm.uniTpKey].tcontParams.allocID)
 		if err != nil {
 			logger.Errorw(ctx, "No TCont instances found", log.Fields{"device-id": oFsm.deviceID, "err": err})
-			if oFsm.chanSet {
-				// indicate processing error/abort to the caller
-				oFsm.chSuccess <- 0
-				oFsm.chanSet = false //reset the internal channel state
-			}
 			//reset the state machine to enable usage on subsequent requests
 			_ = aPAFsm.pFsm.Event(aniEvReset)
 			return
@@ -1081,6 +1075,21 @@
 	logger.Debugw(ctx, "uniPonAniConfigFsm resetting", log.Fields{
 		"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID})
 
+	if oFsm.isChanSet() {
+		// indicate processing error to the caller (in case there was still some open request)
+		logger.Debugw(ctx, "uniPonAniConfigFsm processingError on channel", log.Fields{
+			"ProcessingStep": oFsm.procStep, "from_State": e.FSM.Current(), "device-id": oFsm.deviceID})
+		//use non-blocking channel send to avoid blocking because of non-existing receiver
+		//  (even though the channel is checked on 'set', the outside receiver channel might (theoretically) already be deleted)
+		select {
+		case oFsm.chSuccess <- 0:
+		default:
+			logger.Debugw(ctx, "uniPonAniConfigFsm processingError not send on channel (no receiver)", log.Fields{
+				"device-id": oFsm.deviceID})
+		}
+		oFsm.setChanSet(false) //reset the internal channel state
+	}
+
 	pConfigAniStateAFsm := oFsm.pAdaptFsm
 	if pConfigAniStateAFsm != nil {
 		// abort running message processing
diff --git a/internal/pkg/onuadaptercore/omci_vlan_config.go b/internal/pkg/onuadaptercore/omci_vlan_config.go
index 973d469..56deddf 100644
--- a/internal/pkg/onuadaptercore/omci_vlan_config.go
+++ b/internal/pkg/onuadaptercore/omci_vlan_config.go
@@ -589,15 +589,18 @@
 					oFsm.actualUniVlanConfigMeter = oFsm.uniVlanFlowParamsSlice[oFsm.configuredUniFlow].Meter
 					//tpId of the next rule to be configured
 					tpID := oFsm.actualUniVlanConfigRule.TpID
-					loTechProfDone := oFsm.pUniTechProf.getTechProfileDone(ctx, oFsm.pOnuUniPort.uniID, tpID)
 					oFsm.TpIDWaitingFor = tpID
-					logger.Debugw(ctx, "UniVlanConfigFsm - incremental config request (on setConfig)", log.Fields{
-						"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID,
-						"set-Vlan": oFsm.actualUniVlanConfigRule.SetVid, "tp-id": tpID, "ProfDone": loTechProfDone})
-
+					loSetVlan := oFsm.actualUniVlanConfigRule.SetVid
 					//attention: take care to release the mutexFlowParams when calling the FSM directly -
 					//  synchronous FSM 'event/state' functions may rely on this mutex
+					//  but it must be released already before calling getTechProfileDone() as it may already be locked
+					//  by the techProfile processing call to VlanFsm.IsFlowRemovePending() (see VOL-4207)
 					oFsm.mutexFlowParams.Unlock()
+					loTechProfDone := oFsm.pUniTechProf.getTechProfileDone(ctx, oFsm.pOnuUniPort.uniID, tpID)
+					logger.Debugw(ctx, "UniVlanConfigFsm - incremental config request (on setConfig)", log.Fields{
+						"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID,
+						"set-Vlan": loSetVlan, "tp-id": tpID, "ProfDone": loTechProfDone})
+
 					var fsmErr error
 					if loTechProfDone {
 						// let the vlan processing continue with next rule
@@ -1091,12 +1094,16 @@
 		oFsm.TpIDWaitingFor = tpID
 		//cmp also usage in EVTOCDE create in omci_cc
 		oFsm.evtocdID = macBridgeServiceProfileEID + uint16(oFsm.pOnuUniPort.macBpNo)
+		loSetVlan := oFsm.actualUniVlanConfigRule.SetVid
+		//attention: take care to release the mutexFlowParams when calling the FSM directly -
+		//  synchronous FSM 'event/state' functions may rely on this mutex
+		//  but it must be released already before calling getTechProfileDone() as it may already be locked
+		//  by the techProfile processing call to VlanFsm.IsFlowRemovePending() (see VOL-4207)
 		oFsm.mutexFlowParams.Unlock()
-
 		loTechProfDone := oFsm.pUniTechProf.getTechProfileDone(ctx, oFsm.pOnuUniPort.uniID, uint8(tpID))
 		logger.Debugw(ctx, "UniVlanConfigFsm - start with first rule", log.Fields{
 			"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID,
-			"set-Vlan": oFsm.actualUniVlanConfigRule.SetVid, "tp-id": tpID, "ProfDone": loTechProfDone})
+			"set-Vlan": loSetVlan, "tp-id": tpID, "ProfDone": loTechProfDone})
 
 		// Can't call FSM Event directly, decoupling it
 		go func(aPAFsm *AdapterFsm, aTechProfDone bool) {
@@ -1300,11 +1307,17 @@
 		//tpId of the next rule to be configured
 		tpID := oFsm.actualUniVlanConfigRule.TpID
 		oFsm.TpIDWaitingFor = tpID
+		loSetVlan := oFsm.actualUniVlanConfigRule.SetVid
+		//attention: take care to release the mutexFlowParams when calling the FSM directly -
+		//  synchronous FSM 'event/state' functions may rely on this mutex
+		//  but it must be released already before calling getTechProfileDone() as it may already be locked
+		//  by the techProfile processing call to VlanFsm.IsFlowRemovePending() (see VOL-4207)
+		oFsm.mutexFlowParams.Unlock()
 		loTechProfDone := oFsm.pUniTechProf.getTechProfileDone(ctx, oFsm.pOnuUniPort.uniID, tpID)
 		logger.Debugw(ctx, "UniVlanConfigFsm - incremental config request", log.Fields{
 			"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID,
-			"set-Vlan": oFsm.actualUniVlanConfigRule.SetVid, "tp-id": tpID, "ProfDone": loTechProfDone})
-		oFsm.mutexFlowParams.Unlock()
+			"set-Vlan": loSetVlan, "tp-id": tpID, "ProfDone": loTechProfDone})
+
 		// Can't call FSM Event directly, decoupling it
 		go func(aPBaseFsm *fsm.FSM, aTechProfDone bool) {
 			if aTechProfDone {