[VOL-4023] openonuAdapterGo - Ani side configuration fails while waiting for FlowDeletion, and upgrade looplab/fsm

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: I9053d2ee774ff46c6d3a9cff5743a141bf14ad58
diff --git a/internal/pkg/onuadaptercore/device_handler.go b/internal/pkg/onuadaptercore/device_handler.go
index 2d4d67d..93c4fe5 100644
--- a/internal/pkg/onuadaptercore/device_handler.go
+++ b/internal/pkg/onuadaptercore/device_handler.go
@@ -2746,35 +2746,6 @@
 	dh.lockVlanConfig.Unlock()
 }
 
-//ProcessPendingTpDelete processes any pending TP delete (if available)
-func (dh *deviceHandler) ProcessPendingTpDelete(ctx context.Context, apUniPort *onuUniPort, aTpID uint8) {
-	logger.Debugw(ctx, "enter processing pending tp delete", log.Fields{"device-id": dh.deviceID, "tpID": aTpID})
-	if apUniPort == nil {
-		logger.Errorw(ctx, "uni port is nil", log.Fields{"device-id": dh.deviceID})
-		return
-	}
-	k := uniTP{uniID: apUniPort.uniID, tpID: aTpID}
-	if pAniConfigFsm, ok := dh.pOnuTP.pAniConfigFsm[k]; pAniConfigFsm != nil && ok {
-		pAniConfigStatemachine := pAniConfigFsm.pAdaptFsm.pFsm
-		if pAniConfigStatemachine != nil {
-			//If the gem port delete was waiting on flow remove, indicate event that flow remove is done
-			if pAniConfigStatemachine.Is(aniStWaitingFlowRem) {
-				logger.Debugw(ctx, "ani fsm in aniStWaitingFlowRem state - handling aniEvFlowRemDone event",
-					log.Fields{"device-id": dh.deviceID, "tpID": aTpID})
-				if err := pAniConfigStatemachine.Event(aniEvFlowRemDone); err != nil {
-					logger.Warnw(ctx, "AniConfigFsm: can't continue processing", log.Fields{"err": err,
-						"device-id": dh.deviceID, "UniPort": apUniPort.portNo, "tpID": aTpID})
-					return
-				}
-			} else {
-				logger.Debugw(ctx, "ani fsm not in aniStWaitingFlowRem state", log.Fields{"device-id": dh.deviceID, "tpID": aTpID})
-				return
-			}
-		}
-		return
-	}
-}
-
 //startWritingOnuDataToKvStore initiates the KVStore write of ONU persistent data
 func (dh *deviceHandler) startWritingOnuDataToKvStore(ctx context.Context, aPDevEntry *OnuDeviceEntry) error {
 	dh.mutexKvStoreContext.Lock()         //this write routine may (could) be called with the same context,
diff --git a/internal/pkg/onuadaptercore/omci_ani_config.go b/internal/pkg/onuadaptercore/omci_ani_config.go
index 19167f6..c7433c8 100644
--- a/internal/pkg/onuadaptercore/omci_ani_config.go
+++ b/internal/pkg/onuadaptercore/omci_ani_config.go
@@ -127,6 +127,8 @@
 	gemPortAttribsSlice      []ponAniGemPortAttribs
 	pLastTxMeInstance        *me.ManagedEntity
 	requestEventOffset       uint8 //used to indicate ConfigDone or Removed using successor (enum)
+	isWaitingForFlowDelete   bool
+	waitFlowDeleteChannel    chan bool
 }
 
 //newUniPonAniConfigFsm is the 'constructor' for the state machine to config the PON ANI ports of ONU UNI ports via OMCI
@@ -145,6 +147,7 @@
 		chanSet:        false,
 	}
 	instFsm.uniTpKey = uniTP{uniID: apUniPort.uniID, tpID: aTechProfileID}
+	instFsm.waitFlowDeleteChannel = make(chan bool)
 
 	instFsm.pAdaptFsm = NewAdapterFsm(aName, instFsm.deviceID, aCommChannel)
 	if instFsm.pAdaptFsm == nil {
@@ -194,7 +197,7 @@
 			// exceptional treatment for all states except aniStResetting
 			{Name: aniEvReset, Src: []string{aniStStarting, aniStCreatingDot1PMapper, aniStCreatingMBPCD,
 				aniStSettingTconts, aniStCreatingGemNCTPs, aniStCreatingGemIWs, aniStSettingPQs, aniStSettingDot1PMapper,
-				aniStConfigDone, aniStRemovingGemIW, aniStRemovingGemNCTP,
+				aniStConfigDone, aniStRemovingGemIW, aniStWaitingFlowRem, aniStRemovingGemNCTP,
 				aniStResetTcont, aniStRemDot1PMapper, aniStRemAniBPCD, aniStRemoveDone}, Dst: aniStResetting},
 			// the only way to get to resource-cleared disabled state again is via "resseting"
 			{Name: aniEvRestart, Src: []string{aniStResetting}, Dst: aniStDisabled},
@@ -212,6 +215,7 @@
 			("enter_" + aniStSettingDot1PMapper):  func(e *fsm.Event) { instFsm.enterSettingDot1PMapper(ctx, e) },
 			("enter_" + aniStConfigDone):          func(e *fsm.Event) { instFsm.enterAniConfigDone(ctx, e) },
 			("enter_" + aniStRemovingGemIW):       func(e *fsm.Event) { instFsm.enterRemovingGemIW(ctx, e) },
+			("enter_" + aniStWaitingFlowRem):      func(e *fsm.Event) { instFsm.enterWaitingFlowRem(ctx, e) },
 			("enter_" + aniStRemovingGemNCTP):     func(e *fsm.Event) { instFsm.enterRemovingGemNCTP(ctx, e) },
 			("enter_" + aniStResetTcont):          func(e *fsm.Event) { instFsm.enterResettingTcont(ctx, e) },
 			("enter_" + aniStRemDot1PMapper):      func(e *fsm.Event) { instFsm.enterRemoving1pMapper(ctx, e) },
@@ -249,6 +253,10 @@
 		//use channel to indicate that the response waiting shall be aborted
 		oFsm.omciMIdsResponseReceived <- false
 	}
+	if oFsm.isWaitingForFlowDelete {
+		//use channel to indicate that the response waiting shall be aborted
+		oFsm.waitFlowDeleteChannel <- false
+	}
 	// in any case (even if it might be automatically requested by above cancellation of waiting) ensure resetting the FSM
 	pAdaptFsm := oFsm.pAdaptFsm
 	if pAdaptFsm != nil {
@@ -672,9 +680,14 @@
 }
 
 func (oFsm *uniPonAniConfigFsm) enterRemovingGemIW(ctx context.Context, e *fsm.Event) {
-
 	oFsm.pUniTechProf.mutexTPState.Lock()
-	if oFsm.pDeviceHandler.UniVlanConfigFsmMap[oFsm.pOnuUniPort.uniID].IsFlowRemovePending() {
+	//flush the waitFlowDeleteChannel - possibly already/still set by some previous activity
+	select {
+	case <-oFsm.waitFlowDeleteChannel:
+		logger.Debug(ctx, "flushed waitFlowDeleteChannel")
+	default:
+	}
+	if oFsm.pDeviceHandler.UniVlanConfigFsmMap[oFsm.pOnuUniPort.uniID].IsFlowRemovePending(oFsm.waitFlowDeleteChannel) {
 		oFsm.pUniTechProf.mutexTPState.Unlock()
 		logger.Debugw(ctx, "flow remove pending - wait before processing gem port delete",
 			log.Fields{"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID, "techProfile-id": oFsm.techProfileID})
@@ -707,6 +720,76 @@
 	oFsm.pLastTxMeInstance = meInstance
 }
 
+func (oFsm *uniPonAniConfigFsm) enterWaitingFlowRem(ctx context.Context, e *fsm.Event) {
+	oFsm.mutexIsAwaitingResponse.Lock()
+	oFsm.isWaitingForFlowDelete = true
+	oFsm.mutexIsAwaitingResponse.Unlock()
+	select {
+	// maybe be also some outside cancel (but no context modeled for the moment ...)
+	// case <-ctx.Done():
+	// 		logger.Infow("LockState-bridge-init message reception canceled", log.Fields{"for device-id": oFsm.deviceID})
+	case <-time.After(10 * time.Second): //give flow processing enough time to finish (but try to be less than rwCore flow timeouts)
+		logger.Warnw(ctx, "uniPonAniConfigFsm WaitingFlowRem timeout", log.Fields{
+			"for device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID, "techProfile-id": oFsm.techProfileID})
+		oFsm.mutexIsAwaitingResponse.Lock()
+		oFsm.isWaitingForFlowDelete = false
+		oFsm.mutexIsAwaitingResponse.Unlock()
+		//if the flow is not removed as expected we just try to continue with GemPort removal and hope things are clearing up afterwards
+		pConfigAniStateAFsm := oFsm.pAdaptFsm
+		if pConfigAniStateAFsm != nil {
+			// obviously calling some FSM event here directly does not work - so trying to decouple it ...
+			go func(aPAFsm *AdapterFsm) {
+				if aPAFsm != nil && aPAFsm.pFsm != nil {
+					_ = oFsm.pAdaptFsm.pFsm.Event(aniEvFlowRemDone)
+				}
+			}(pConfigAniStateAFsm)
+		} else {
+			logger.Errorw(ctx, "pConfigAniStateAFsm is nil", log.Fields{
+				"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID, "techProfile-id": oFsm.techProfileID})
+		}
+		return
+
+	case success := <-oFsm.waitFlowDeleteChannel:
+		if success {
+			logger.Debugw(ctx, "uniPonAniConfigFsm flow removed info received", log.Fields{
+				"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID, "techProfile-id": oFsm.techProfileID})
+			oFsm.mutexIsAwaitingResponse.Lock()
+			oFsm.isWaitingForFlowDelete = false
+			oFsm.mutexIsAwaitingResponse.Unlock()
+			pConfigAniStateAFsm := oFsm.pAdaptFsm
+			if pConfigAniStateAFsm != nil {
+				// obviously calling some FSM event here directly does not work - so trying to decouple it ...
+				go func(aPAFsm *AdapterFsm) {
+					if aPAFsm != nil && aPAFsm.pFsm != nil {
+						_ = oFsm.pAdaptFsm.pFsm.Event(aniEvFlowRemDone)
+					}
+				}(pConfigAniStateAFsm)
+			} else {
+				logger.Errorw(ctx, "pConfigAniStateAFsm is nil", log.Fields{
+					"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID, "techProfile-id": oFsm.techProfileID})
+			}
+			return
+		}
+		// waiting was aborted (probably on external request)
+		logger.Debugw(ctx, "uniPonAniConfigFsm WaitingFlowRem aborted", log.Fields{
+			"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID, "techProfile-id": oFsm.techProfileID})
+		oFsm.mutexIsAwaitingResponse.Lock()
+		oFsm.isWaitingForFlowDelete = false
+		oFsm.mutexIsAwaitingResponse.Unlock()
+		//to be sure we can just generate the reset-event to ensure leaving this state towards 'reset'
+		pConfigAniStateAFsm := oFsm.pAdaptFsm
+		if pConfigAniStateAFsm != nil {
+			// obviously calling some FSM event here directly does not work - so trying to decouple it ...
+			go func(aPAFsm *AdapterFsm) {
+				if aPAFsm != nil && aPAFsm.pFsm != nil {
+					_ = aPAFsm.pFsm.Event(aniEvReset)
+				}
+			}(pConfigAniStateAFsm)
+		}
+		return
+	}
+}
+
 func (oFsm *uniPonAniConfigFsm) enterRemovingGemNCTP(ctx context.Context, e *fsm.Event) {
 	oFsm.pUniTechProf.mutexTPState.Lock()
 	loGemPortID := (*(oFsm.pUniTechProf.mapRemoveGemEntry[oFsm.uniTpKey])).gemPortID
diff --git a/internal/pkg/onuadaptercore/omci_vlan_config.go b/internal/pkg/onuadaptercore/omci_vlan_config.go
index 45e0944..89433d4 100644
--- a/internal/pkg/onuadaptercore/omci_vlan_config.go
+++ b/internal/pkg/onuadaptercore/omci_vlan_config.go
@@ -144,7 +144,8 @@
 	vlanRuleParams uniVlanRuleParams
 }
 
-//UniVlanConfigFsm defines the structure for the state machine to config the PON ANI ports of ONU UNI ports via OMCI
+//UniVlanConfigFsm defines the structure for the state machine for configuration of the VLAN related setting via OMCI
+//  builds upon 'VLAN rules' that are derived from multiple flows
 type UniVlanConfigFsm struct {
 	pDeviceHandler              *deviceHandler
 	deviceID                    string
@@ -160,7 +161,7 @@
 	isAwaitingResponse          bool
 	mutexIsAwaitingResponse     sync.RWMutex
 	mutexFlowParams             sync.RWMutex
-	chCookieDeleted             chan bool //channel to indicate that a specificly indicated cookie was deleted
+	chCookieDeleted             chan bool //channel to indicate that a specific cookie (related to the active rule) was deleted
 	actualUniVlanConfigRule     uniVlanRuleParams
 	uniVlanFlowParamsSlice      []uniVlanFlowParams
 	uniRemoveFlowsSlice         []uniRemoveVlanFlowParams
@@ -173,6 +174,8 @@
 	pLastTxMeInstance           *me.ManagedEntity
 	requestEventOffset          uint8
 	TpIDWaitingFor              uint8
+	signalOnFlowDelete          bool
+	flowDeleteChannel           chan<- bool
 	//cookie value that indicates that a rule to add is delayed by waiting for deletion of some other existing rule with the same cookie
 	delayNewRuleCookie uint64
 }
@@ -341,8 +344,9 @@
 	// in any case (even if it might be automatically requested by above cancellation of waiting) ensure resetting the FSM
 	pAdaptFsm := oFsm.pAdaptFsm
 	if pAdaptFsm != nil {
-		if pAdaptFsm.pFsm != nil {
-			_ = pAdaptFsm.pFsm.Event(vlanEvReset)
+		if fsmErr := pAdaptFsm.pFsm.Event(vlanEvReset); fsmErr != nil {
+			logger.Errorw(ctx, "error in FsmEvent handling UniVlanConfigFsm!",
+				log.Fields{"fsmState": oFsm.pAdaptFsm.pFsm.Current(), "error": fsmErr, "device-id": oFsm.deviceID})
 		}
 	}
 }
@@ -480,9 +484,14 @@
 			if oFsm.pDeviceHandler.isSkipOnuConfigReconciling() {
 				logger.Debugw(ctx, "reconciling - skip omci-config of additional vlan rule",
 					log.Fields{"fsmState": oFsm.pAdaptFsm.pFsm.Current(), "device-id": oFsm.deviceID})
+				//attention: take care to release the mutexFlowParams when calling the FSM directly -
+				//  synchronous FSM 'event/state' functions may rely on this mutex
 				oFsm.mutexFlowParams.Unlock()
 				if pConfigVlanStateBaseFsm.Is(vlanStConfigDone) {
-					_ = pConfigVlanStateBaseFsm.Event(vlanEvSkipOmciConfig)
+					if fsmErr := pConfigVlanStateBaseFsm.Event(vlanEvSkipOmciConfig); fsmErr != nil {
+						logger.Errorw(ctx, "error in FsmEvent handling UniVlanConfigFsm!",
+							log.Fields{"fsmState": oFsm.pAdaptFsm.pFsm.Current(), "error": fsmErr, "device-id": oFsm.deviceID})
+					}
 				}
 				return nil
 			}
@@ -496,9 +505,13 @@
 				if oFsm.configuredUniFlow == 0 {
 					// this is a restart with a complete new flow, we can re-use the initial flow config control
 					// including the check, if the related techProfile is (still) available (probably also removed in between)
-					go func(a_pBaseFsm *fsm.FSM) {
-						_ = a_pBaseFsm.Event(vlanEvRenew)
-					}(pConfigVlanStateBaseFsm)
+					//attention: take care to release the mutexFlowParams when calling the FSM directly -
+					//  synchronous FSM 'event/state' functions may rely on this mutex
+					oFsm.mutexFlowParams.Unlock()
+					if fsmErr := pConfigVlanStateBaseFsm.Event(vlanEvRenew); fsmErr != nil {
+						logger.Errorw(ctx, "error in FsmEvent handling UniVlanConfigFsm!",
+							log.Fields{"fsmState": pConfigVlanStateBaseFsm.Current(), "error": fsmErr, "device-id": oFsm.deviceID})
+					}
 				} else {
 					//some further flows are to be configured
 					//store the actual rule that shall be worked upon in the following transient states
@@ -510,19 +523,28 @@
 					logger.Debugw(ctx, "UniVlanConfigFsm - incremental config request (on setConfig)", log.Fields{
 						"device-id": oFsm.deviceID, "uni-id": oFsm.pOnuUniPort.uniID,
 						"set-Vlan": oFsm.actualUniVlanConfigRule.SetVid, "tp-id": tpID, "ProfDone": loTechProfDone})
-					go func(aPBaseFsm *fsm.FSM, aTechProfDone bool) {
-						if aTechProfDone {
-							// let the vlan processing continue with next rule
-							_ = aPBaseFsm.Event(vlanEvIncrFlowConfig)
-						} else {
-							// set to waiting for Techprofile
-							_ = aPBaseFsm.Event(vlanEvWaitTPIncr)
-						}
-					}(pConfigVlanStateBaseFsm, loTechProfDone)
+
+					//attention: take care to release the mutexFlowParams when calling the FSM directly -
+					//  synchronous FSM 'event/state' functions may rely on this mutex
+					oFsm.mutexFlowParams.Unlock()
+					var fsmErr error
+					if loTechProfDone {
+						// let the vlan processing continue with next rule
+						fsmErr = pConfigVlanStateBaseFsm.Event(vlanEvIncrFlowConfig)
+					} else {
+						// set to waiting for Techprofile
+						fsmErr = pConfigVlanStateBaseFsm.Event(vlanEvWaitTPIncr)
+					}
+					if fsmErr != nil {
+						logger.Errorw(ctx, "error in FsmEvent handling UniVlanConfigFsm!",
+							log.Fields{"fsmState": pConfigVlanStateBaseFsm.Current(), "error": fsmErr, "device-id": oFsm.deviceID})
+					}
 				}
-			} // if not in the appropriate state a new entry will be automatically considered later
-			//   when the configDone state is reached
-			oFsm.mutexFlowParams.Unlock()
+			} else {
+				// if not in the appropriate state a new entry will be automatically considered later
+				//   when the configDone state is reached
+				oFsm.mutexFlowParams.Unlock()
+			}
 		} else {
 			logger.Errorw(ctx, "UniVlanConfigFsm flow limit exceeded", log.Fields{
 				"device-id": oFsm.deviceID, "flow-number": oFsm.numUniFlows})
@@ -733,9 +755,14 @@
 					//trigger the FSM to remove the relevant rule
 					if cancelPendingConfig {
 						oFsm.requestEventOffset = uint8(cDeviceEventOffsetRemoveWithKvStore) //offset for last flow-remove activity (with kvStore request)
-						go func(a_pBaseFsm *fsm.FSM) {
-							_ = a_pBaseFsm.Event(vlanEvCancelOutstandingConfig)
-						}(pConfigVlanStateBaseFsm)
+						//attention: take care to release and re-take the mutexFlowParams when calling the FSM directly -
+						//  synchronous FSM 'event/state' functions may rely on this mutex
+						oFsm.mutexFlowParams.Unlock()
+						if fsmErr := pConfigVlanStateBaseFsm.Event(vlanEvCancelOutstandingConfig); fsmErr != nil {
+							logger.Errorw(ctx, "error in FsmEvent handling UniVlanConfigFsm!",
+								log.Fields{"fsmState": pConfigVlanStateBaseFsm.Current(), "error": fsmErr, "device-id": oFsm.deviceID})
+						}
+						oFsm.mutexFlowParams.Lock()
 					} else {
 						if pConfigVlanStateBaseFsm.Is(vlanStConfigDone) {
 							logger.Debugw(ctx, "UniVlanConfigFsm rule removal request", log.Fields{
@@ -743,10 +770,14 @@
 								"tp-id":    loRemoveParams.vlanRuleParams.TpID,
 								"set-Vlan": loRemoveParams.vlanRuleParams.SetVid})
 							//have to re-trigger the FSM to proceed with outstanding incremental flow configuration
-							// Can't call FSM Event directly, decoupling it
-							go func(a_pBaseFsm *fsm.FSM) {
-								_ = a_pBaseFsm.Event(vlanEvRemFlowConfig)
-							}(pConfigVlanStateBaseFsm)
+							//attention: take care to release and re-take the mutexFlowParams when calling the FSM directly -
+							//  synchronous FSM 'event/state' functions may rely on this mutex
+							oFsm.mutexFlowParams.Unlock()
+							if fsmErr := pConfigVlanStateBaseFsm.Event(vlanEvRemFlowConfig); fsmErr != nil {
+								logger.Errorw(ctx, "error in FsmEvent handling UniVlanConfigFsm!",
+									log.Fields{"fsmState": pConfigVlanStateBaseFsm.Current(), "error": fsmErr, "device-id": oFsm.deviceID})
+							}
+							oFsm.mutexFlowParams.Lock()
 						} // if not in the appropriate state a new entry will be automatically considered later
 						//   when the configDone state is reached
 					}
@@ -1350,7 +1381,7 @@
 		}(pConfigVlanStateAFsm)
 	}
 
-	oFsm.mutexFlowParams.RLock()
+	oFsm.mutexFlowParams.Lock()
 	noOfFlowRem := len(oFsm.uniRemoveFlowsSlice)
 	if deletedCookie == oFsm.delayNewRuleCookie {
 		// flush the channel CookieDeleted to ensure it is not lingering from some previous (aborted) activity
@@ -1361,13 +1392,16 @@
 		}
 		oFsm.chCookieDeleted <- true // let the waiting AddFlow thread continue
 	}
-	oFsm.mutexFlowParams.RUnlock()
-	// If all pending flow removes are completed and TP ID is valid, processing any pending TP delete
-	if noOfFlowRem == 0 && tpID > 0 {
-		logger.Debugw(ctx, "processing pending tp delete", log.Fields{"device-id": oFsm.deviceID, "tpID": tpID})
+	// If all pending flow-removes are completed and TP ID is valid go on processing any pending TP delete
+	if oFsm.signalOnFlowDelete && noOfFlowRem == 0 && tpID > 0 {
+		logger.Debugw(ctx, "signal flow removal for pending TP delete", log.Fields{"device-id": oFsm.deviceID, "tpID": tpID})
 		// If we are here then all flows are removed.
-		oFsm.pDeviceHandler.ProcessPendingTpDelete(ctx, oFsm.pOnuUniPort, tpID)
+		if len(oFsm.flowDeleteChannel) == 0 { //channel not yet in use
+			oFsm.flowDeleteChannel <- true
+			oFsm.signalOnFlowDelete = false
+		}
 	}
+	oFsm.mutexFlowParams.Unlock()
 }
 
 func (oFsm *UniVlanConfigFsm) enterResetting(ctx context.Context, e *fsm.Event) {
@@ -2358,8 +2392,14 @@
 }
 
 // IsFlowRemovePending returns true if there are pending flows to remove, else false.
-func (oFsm *UniVlanConfigFsm) IsFlowRemovePending() bool {
-	oFsm.mutexFlowParams.RLock()
-	defer oFsm.mutexFlowParams.RUnlock()
-	return len(oFsm.uniRemoveFlowsSlice) > 0
+func (oFsm *UniVlanConfigFsm) IsFlowRemovePending(aFlowDeleteChannel chan<- bool) bool {
+	oFsm.mutexFlowParams.Lock()
+	defer oFsm.mutexFlowParams.Unlock()
+	if len(oFsm.uniRemoveFlowsSlice) > 0 {
+		//flow removal is still ongoing/pending
+		oFsm.signalOnFlowDelete = true
+		oFsm.flowDeleteChannel = aFlowDeleteChannel
+		return true
+	}
+	return false
 }