[VOL-4438] OnuDown indication can collide with FlowDeletion resulting in wrong flowDelete error indication

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: I53bfcc7276e812992d6222c1fd40a250e69b8ef2
diff --git a/VERSION b/VERSION
index 61ee492..b6e9f71 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.0.3-dev243
+2.0.3-dev244
diff --git a/internal/pkg/avcfg/omci_vlan_config.go b/internal/pkg/avcfg/omci_vlan_config.go
index df47501..ce6d13a 100755
--- a/internal/pkg/avcfg/omci_vlan_config.go
+++ b/internal/pkg/avcfg/omci_vlan_config.go
@@ -157,7 +157,6 @@
 	omciMIdsResponseReceived    chan bool //seperate channel needed for checking multiInstance OMCI message responses
 	PAdaptFsm                   *cmn.AdapterFsm
 	acceptIncrementalEvtoOption bool
-	clearPersistency            bool
 	isCanceled                  bool
 	isAwaitingResponse          bool
 	mutexIsAwaitingResponse     sync.RWMutex
@@ -204,7 +203,6 @@
 		NumUniFlows:                 0,
 		ConfiguredUniFlow:           0,
 		numRemoveFlows:              0,
-		clearPersistency:            true,
 		lastFlowToReconcile:         lastFlowToRec,
 	}
 
@@ -395,18 +393,6 @@
 	return oFsm.TpIDWaitingFor
 }
 
-//RequestClearPersistency sets the internal flag to not clear persistency data (false=NoClear)
-func (oFsm *UniVlanConfigFsm) RequestClearPersistency(ctx context.Context, aClear bool) {
-	if oFsm == nil {
-		logger.Error(ctx, "no valid UniVlanConfigFsm!")
-		return
-	}
-	//mutex protection is required for possible concurrent access to FSM members
-	oFsm.mutexFlowParams.Lock()
-	defer oFsm.mutexFlowParams.Unlock()
-	oFsm.clearPersistency = aClear
-}
-
 //SetUniFlowParams verifies on existence of flow parameters to be configured,
 // optionally udates the cookie list or appends a new flow if there is space
 // if possible the FSM is trigggerd to start with the processing
@@ -1838,6 +1824,10 @@
 func (oFsm *UniVlanConfigFsm) enterResetting(ctx context.Context, e *fsm.Event) {
 	logger.Debugw(ctx, "UniVlanConfigFsm resetting", log.Fields{"device-id": oFsm.deviceID})
 
+	oFsm.mutexPLastTxMeInstance.Lock()
+	oFsm.pLastTxMeInstance = nil //to avoid misinterpretation in case of some lingering frame reception processing
+	oFsm.mutexPLastTxMeInstance.Unlock()
+
 	pConfigVlanStateAFsm := oFsm.PAdaptFsm
 	if pConfigVlanStateAFsm != nil {
 		// abort running message processing
@@ -1849,68 +1839,98 @@
 		}
 		pConfigVlanStateAFsm.CommChan <- fsmAbortMsg
 
-		//try to restart the FSM to 'disabled'
-		// Can't call FSM Event directly, decoupling it
-		go func(a_pAFsm *cmn.AdapterFsm) {
-			if a_pAFsm != nil && a_pAFsm.PFsm != nil {
-				_ = a_pAFsm.PFsm.Event(VlanEvRestart)
+		//internal data is not explicitly removed, this is left to garbage collection after complete FSM removal
+		//  but some channels have to be cleared to avoid unintended waiting for events, that have no meaning anymore now
+
+		oFsm.mutexFlowParams.RLock()
+		if oFsm.delayNewRuleCookie != 0 {
+			// looks like the waiting AddFlow is stuck
+			oFsm.mutexFlowParams.RUnlock()
+			//use asynchronous channel sending to avoid stucking on non-waiting receiver
+			select {
+			case oFsm.chCookieDeleted <- false: // let the waiting AddFlow thread terminate
+			default:
 			}
-		}(pConfigVlanStateAFsm)
+			oFsm.mutexFlowParams.RLock()
+		}
+		if len(oFsm.uniRemoveFlowsSlice) > 0 {
+			for _, removeUniFlowParams := range oFsm.uniRemoveFlowsSlice {
+				if removeUniFlowParams.isSuspendedOnAdd {
+					removeChannel := removeUniFlowParams.removeChannel
+					logger.Debugw(ctx, "UniVlanConfigFsm flow clear-up - abort suspended rule-add", log.Fields{
+						"device-id": oFsm.deviceID, "cookie": removeUniFlowParams.cookie})
+					oFsm.mutexFlowParams.RUnlock()
+					//use asynchronous channel sending to avoid stucking on non-waiting receiver
+					select {
+					case removeChannel <- false:
+					default:
+					}
+					oFsm.mutexFlowParams.RLock()
+				}
+				// Send response on response channel if the caller is waiting on it.
+				var err error = nil
+				if !oFsm.isCanceled {
+					//only if the FSM is not canceled on external request use some error indication for the respChan
+					//  so only at real internal FSM abortion some error code is sent back
+					//  on the deleteFlow with the hope the system may handle such error situation (possibly retrying)
+					err = fmt.Errorf("internal-error")
+				}
+				//if the FSM was cancelled on external request the assumption is, that all processing has to be stopped
+				//  assumed in connection with some ONU down/removal indication in which case all flows can be considered as removed
+				oFsm.pushReponseOnFlowResponseChannel(ctx, removeUniFlowParams.respChan, err)
+			}
+		}
+
+		if oFsm.pDeviceHandler != nil {
+			if len(oFsm.uniVlanFlowParamsSlice) > 0 {
+				if !oFsm.isCanceled {
+					//if the FSM is not canceled on external request use "internal-error" for the respChan
+					for _, vlanRule := range oFsm.uniVlanFlowParamsSlice {
+						// Send response on response channel if the caller is waiting on it with according error indication.
+						oFsm.pushReponseOnFlowResponseChannel(ctx, vlanRule.RespChan, fmt.Errorf("internal-error"))
+					}
+					//permanently remove possibly stored persistent data
+					var emptySlice = make([]cmn.UniVlanFlowParams, 0)
+					_ = oFsm.pDeviceHandler.StorePersUniFlowConfig(ctx, oFsm.pOnuUniPort.UniID, &emptySlice, true) //ignore errors
+				} else {
+					// reset (cancel) of all Fsm is always accompanied by global persistency data removal
+					//  no need to remove specific data in this case here
+					//  TODO: cancelation may also abort a running flowAdd activity in which case it would be better
+					//     to also resopnd on the respChan with some error ("config canceled"), but that is a bit hard to decide here
+					//     so just left open by now
+					logger.Debugw(ctx, "UniVlanConfigFsm persistency data not cleared", log.Fields{"device-id": oFsm.deviceID})
+				}
+			}
+			oFsm.mutexFlowParams.RUnlock()
+
+			//try to let the FSM proceed to 'disabled'
+			// Can't call FSM Event directly, decoupling it
+			go func(a_pAFsm *cmn.AdapterFsm) {
+				if a_pAFsm != nil && a_pAFsm.PFsm != nil {
+					_ = a_pAFsm.PFsm.Event(VlanEvRestart)
+				}
+			}(pConfigVlanStateAFsm)
+			return
+		}
+		oFsm.mutexFlowParams.RUnlock()
+		logger.Warnw(ctx, "UniVlanConfigFsm - device handler already vanished",
+			log.Fields{"device-id": oFsm.deviceID})
+		return
 	}
+	logger.Warnw(ctx, "UniVlanConfigFsm - FSM pointer already vanished",
+		log.Fields{"device-id": oFsm.deviceID})
 }
 
 func (oFsm *UniVlanConfigFsm) enterDisabled(ctx context.Context, e *fsm.Event) {
 	logger.Debugw(ctx, "UniVlanConfigFsm enters disabled state", log.Fields{"device-id": oFsm.deviceID})
-	oFsm.mutexPLastTxMeInstance.Lock()
-	oFsm.pLastTxMeInstance = nil
-	oFsm.mutexPLastTxMeInstance.Unlock()
-
-	oFsm.mutexFlowParams.RLock()
-	if oFsm.delayNewRuleCookie != 0 {
-		// looks like the waiting AddFlow is stuck
-		oFsm.mutexFlowParams.RUnlock()
-		oFsm.chCookieDeleted <- false // let the waiting AddFlow thread terminate
-		oFsm.mutexFlowParams.RLock()
-	}
-	if len(oFsm.uniRemoveFlowsSlice) > 0 {
-		for _, removeUniFlowParams := range oFsm.uniRemoveFlowsSlice {
-			if removeUniFlowParams.isSuspendedOnAdd {
-				removeChannel := removeUniFlowParams.removeChannel
-				logger.Debugw(ctx, "UniVlanConfigFsm flow clear-up - abort suspended rule-add", log.Fields{
-					"device-id": oFsm.deviceID, "cookie": removeUniFlowParams.cookie})
-				oFsm.mutexFlowParams.RUnlock()
-				removeChannel <- false
-				oFsm.mutexFlowParams.RLock()
-			}
-			// Send response on response channel if the caller is waiting on it.
-			oFsm.pushReponseOnFlowResponseChannel(ctx, removeUniFlowParams.respChan, fmt.Errorf("internal-error"))
-		}
-	}
 
 	if oFsm.pDeviceHandler != nil {
-		//TODO: to clarify with improved error treatment for VlanConfigFsm (timeout,reception) errors
-		//  current code removes the complete FSM including all flow/rule configuration done so far
-		//  this might be a bit to much, it would require fully new flow config from rwCore (at least on OnuDown/up)
-		//  maybe a more sophisticated approach is possible without clearing the data
-		if oFsm.clearPersistency {
-			//permanently remove possibly stored persistent data
-			if len(oFsm.uniVlanFlowParamsSlice) > 0 {
-				for _, vlanRule := range oFsm.uniVlanFlowParamsSlice {
-					// Send response on response channel if the caller is waiting on it.
-					oFsm.pushReponseOnFlowResponseChannel(ctx, vlanRule.RespChan, fmt.Errorf("internal-error"))
-				}
-				var emptySlice = make([]cmn.UniVlanFlowParams, 0)
-				_ = oFsm.pDeviceHandler.StorePersUniFlowConfig(ctx, oFsm.pOnuUniPort.UniID, &emptySlice, true) //ignore errors
-			}
-		} else {
-			logger.Debugw(ctx, "UniVlanConfigFsm persistency data not cleared", log.Fields{"device-id": oFsm.deviceID})
-		}
-		oFsm.mutexFlowParams.RUnlock()
 		//request removal of 'reference' in the Handler (completely clear the FSM and its data)
 		go oFsm.pDeviceHandler.RemoveVlanFilterFsm(ctx, oFsm.pOnuUniPort)
 		return
 	}
-	oFsm.mutexFlowParams.RUnlock()
+	logger.Warnw(ctx, "UniVlanConfigFsm - device handler already vanished",
+		log.Fields{"device-id": oFsm.deviceID})
 }
 
 func (oFsm *UniVlanConfigFsm) processOmciVlanMessages(ctx context.Context) { //ctx context.Context?
diff --git a/internal/pkg/core/device_handler.go b/internal/pkg/core/device_handler.go
index f875553..d1bd5c1 100755
--- a/internal/pkg/core/device_handler.go
+++ b/internal/pkg/core/device_handler.go
@@ -422,7 +422,7 @@
 }
 
 func (dh *deviceHandler) handleDeleteGemPortRequest(ctx context.Context, delGemPortMsg *ic.DeleteGemPortMessage) error {
-	logger.Infow(ctx, "delete-gem-port-request", log.Fields{"device-id": dh.DeviceID})
+	logger.Infow(ctx, "delete-gem-port-request start", log.Fields{"device-id": dh.DeviceID})
 
 	if dh.pOnuTP == nil {
 		//should normally not happen ...
@@ -434,17 +434,21 @@
 	dh.pOnuTP.LockTpProcMutex()
 	defer dh.pOnuTP.UnlockTpProcMutex()
 
-	if delGemPortMsg.UniId > 255 {
+	if delGemPortMsg.UniId >= platform.MaxUnisPerOnu {
+		logger.Errorw(ctx, "delete-gem-port UniId exceeds range", log.Fields{
+			"device-id": dh.DeviceID, "uni-id": delGemPortMsg.UniId})
 		return fmt.Errorf(fmt.Sprintf("received UniId value exceeds range: %d, device-id: %s",
 			delGemPortMsg.UniId, dh.DeviceID))
 	}
 	uniID := uint8(delGemPortMsg.UniId)
 	tpID, err := cmn.GetTpIDFromTpPath(delGemPortMsg.TpInstancePath)
 	if err != nil {
-		logger.Errorw(ctx, "error-extracting-tp-id-from-tp-path", log.Fields{"err": err, "tp-path": delGemPortMsg.TpInstancePath})
+		logger.Errorw(ctx, "error-extracting-tp-id-from-tp-path", log.Fields{
+			"device-id": dh.DeviceID, "err": err, "tp-path": delGemPortMsg.TpInstancePath})
 		return err
 	}
-	logger.Infow(ctx, "delete-gem-port-request", log.Fields{"device-id": dh.DeviceID, "uni-id": uniID, "tpID": tpID, "gem": delGemPortMsg.GemPortId})
+	logger.Infow(ctx, "delete-gem-port-request", log.Fields{
+		"device-id": dh.DeviceID, "uni-id": uniID, "tpID": tpID, "gem": delGemPortMsg.GemPortId})
 	//a removal of some GemPort would never remove the complete TechProfile entry (done on T-Cont)
 
 	return dh.deleteTechProfileResource(ctx, uniID, tpID, delGemPortMsg.TpInstancePath,
@@ -453,7 +457,7 @@
 }
 
 func (dh *deviceHandler) handleDeleteTcontRequest(ctx context.Context, delTcontMsg *ic.DeleteTcontMessage) error {
-	logger.Infow(ctx, "delete-tcont-request", log.Fields{"device-id": dh.DeviceID})
+	logger.Infow(ctx, "delete-tcont-request start", log.Fields{"device-id": dh.DeviceID})
 
 	pDevEntry := dh.GetOnuDeviceEntry(ctx, true)
 	if pDevEntry == nil {
@@ -471,7 +475,9 @@
 	dh.pOnuTP.LockTpProcMutex()
 	defer dh.pOnuTP.UnlockTpProcMutex()
 
-	if delTcontMsg.UniId > 255 {
+	if delTcontMsg.UniId >= platform.MaxUnisPerOnu {
+		logger.Errorw(ctx, "delete-tcont UniId exceeds range", log.Fields{
+			"device-id": dh.DeviceID, "uni-id": delTcontMsg.UniId})
 		return fmt.Errorf(fmt.Sprintf("received UniId value exceeds range: %d, device-id: %s",
 			delTcontMsg.UniId, dh.DeviceID))
 	}
@@ -479,7 +485,8 @@
 	tpPath := delTcontMsg.TpInstancePath
 	tpID, err := cmn.GetTpIDFromTpPath(tpPath)
 	if err != nil {
-		logger.Errorw(ctx, "error-extracting-tp-id-from-tp-path", log.Fields{"err": err, "tp-path": tpPath})
+		logger.Errorw(ctx, "error-extracting-tp-id-from-tp-path", log.Fields{
+			"device-id": dh.DeviceID, "err": err, "tp-path": tpPath})
 		return err
 	}
 	logger.Infow(ctx, "delete-tcont-request", log.Fields{"device-id": dh.DeviceID, "uni-id": uniID, "tpID": tpID, "tcont": delTcontMsg.AllocId})
@@ -2074,9 +2081,6 @@
 			if pVlanFilterFsm, exist := dh.UniVlanConfigFsmMap[uniPort.UniID]; exist {
 				//VlanFilterFsm exists and was already started
 				dh.lockVlanConfig.RUnlock()
-				//reset of all Fsm is always accompanied by global persistency data removal
-				//  no need to remove specific data
-				pVlanFilterFsm.RequestClearPersistency(ctx, false)
 				//ensure the FSM processing is stopped in case waiting for some response
 				pVlanFilterFsm.CancelProcessing(ctx)
 			} else {