[VOL-3889] missing device reason update from flow removal after ONU reboot

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: I1e3eb63e044901c814f453b99388db56ae171d00
diff --git a/internal/pkg/onuadaptercore/device_handler.go b/internal/pkg/onuadaptercore/device_handler.go
index 59cd8d0..b720f4f 100644
--- a/internal/pkg/onuadaptercore/device_handler.go
+++ b/internal/pkg/onuadaptercore/device_handler.go
@@ -1669,23 +1669,20 @@
 		// FSM  stop maybe encapsulated as OnuTP method - perhaps later in context of module splitting
 		if dh.pOnuTP.pAniConfigFsm != nil {
 			for uniTP := range dh.pOnuTP.pAniConfigFsm {
-				_ = dh.pOnuTP.pAniConfigFsm[uniTP].pAdaptFsm.pFsm.Event(aniEvReset)
+				dh.pOnuTP.pAniConfigFsm[uniTP].CancelProcessing()
 			}
 		}
 		for _, uniPort := range dh.uniEntityMap {
 			// reset the possibly existing VlanConfigFsm
 			dh.lockVlanConfig.RLock()
 			if pVlanFilterFsm, exist := dh.UniVlanConfigFsmMap[uniPort.uniID]; exist {
-				dh.lockVlanConfig.RUnlock()
 				//VlanFilterFsm exists and was already started
-				pVlanFilterStatemachine := pVlanFilterFsm.pAdaptFsm.pFsm
-				if pVlanFilterStatemachine != nil {
-					//reset of all Fsm is always accompanied by global persistency data removal
-					//  no need to remove specific data
-					pVlanFilterFsm.RequestClearPersistency(false)
-					//and reset the UniVlanConfig FSM
-					_ = pVlanFilterStatemachine.Event(vlanEvReset)
-				}
+				dh.lockVlanConfig.RUnlock()
+				//reset of all Fsm is always accompanied by global persistency data removal
+				//  no need to remove specific data
+				pVlanFilterFsm.RequestClearPersistency(false)
+				//ensure the FSM processing is stopped in case waiting for some response
+				pVlanFilterFsm.CancelProcessing()
 			} else {
 				dh.lockVlanConfig.RUnlock()
 			}
@@ -1708,6 +1705,7 @@
 	}
 	dh.lockUpgradeFsm.RUnlock()
 
+	logger.Infow(ctx, "resetFsms done", log.Fields{"device-id": dh.deviceID})
 	return nil
 }
 
@@ -2596,8 +2594,10 @@
 		dh.pOpenOnuAc.AcceptIncrementalEvto, aCookieSlice, aMatchVlan, aSetVlan, aSetPcp)
 	if pVlanFilterFsm != nil {
 		dh.lockVlanConfig.Lock()
+		//ensure the mutex is locked throughout the state transition to 'starting' to prevent unintended (ignored) events to be sent there
+		//  (from parallel processing)
+		defer dh.lockVlanConfig.Unlock()
 		dh.UniVlanConfigFsmMap[apUniPort.uniID] = pVlanFilterFsm
-		dh.lockVlanConfig.Unlock()
 		pVlanFilterStatemachine := pVlanFilterFsm.pAdaptFsm.pFsm
 		if pVlanFilterStatemachine != nil {
 			if pVlanFilterStatemachine.Is(vlanStDisabled) {
diff --git a/internal/pkg/onuadaptercore/omci_ani_config.go b/internal/pkg/onuadaptercore/omci_ani_config.go
index db0b916..01b2f34 100644
--- a/internal/pkg/onuadaptercore/omci_ani_config.go
+++ b/internal/pkg/onuadaptercore/omci_ani_config.go
@@ -23,6 +23,7 @@
 	"fmt"
 	"net"
 	"strconv"
+	"sync"
 	"time"
 
 	"github.com/cevaris/ordered_map"
@@ -112,6 +113,8 @@
 	techProfileID            uint8
 	uniTpKey                 uniTP
 	requestEvent             OnuDeviceEvent
+	mutexIsAwaitingResponse  sync.RWMutex
+	isAwaitingResponse       bool
 	omciMIdsResponseReceived chan bool //separate channel needed for checking multiInstance OMCI message responses
 	pAdaptFsm                *AdapterFsm
 	chSuccess                chan<- uint8
@@ -235,6 +238,27 @@
 	oFsm.chanSet = true
 }
 
+//CancelProcessing ensures that suspended processing at waiting on some response is aborted and reset of FSM
+func (oFsm *uniPonAniConfigFsm) CancelProcessing() {
+	//mutex protection is required for possible concurrent access to FSM members
+	oFsm.mutexIsAwaitingResponse.RLock()
+	defer oFsm.mutexIsAwaitingResponse.RUnlock()
+	if oFsm.isAwaitingResponse {
+		//use channel to indicate that the response waiting shall be aborted
+		oFsm.omciMIdsResponseReceived <- false
+	}
+	// in any case (even if it might be automatically requested by above cancellation of waiting) ensure resetting the FSM
+	pAdaptFsm := oFsm.pAdaptFsm
+	if pAdaptFsm != nil {
+		// obviously calling some FSM event here directly does not work - so trying to decouple it ...
+		go func(aPAFsm *AdapterFsm) {
+			if aPAFsm.pFsm != nil {
+				_ = oFsm.pAdaptFsm.pFsm.Event(aniEvReset)
+			}
+		}(pAdaptFsm)
+	}
+}
+
 func (oFsm *uniPonAniConfigFsm) prepareAndEnterConfigState(ctx context.Context, aPAFsm *AdapterFsm) {
 	if aPAFsm != nil && aPAFsm.pFsm != nil {
 		//stick to pythonAdapter numbering scheme
@@ -1191,20 +1215,32 @@
 }
 
 func (oFsm *uniPonAniConfigFsm) waitforOmciResponse(ctx context.Context) error {
+	oFsm.mutexIsAwaitingResponse.Lock()
+	oFsm.isAwaitingResponse = true
+	oFsm.mutexIsAwaitingResponse.Unlock()
 	select {
 	// maybe be also some outside cancel (but no context modeled for the moment ...)
 	// case <-ctx.Done():
 	// 		logger.Infow("LockState-bridge-init message reception canceled", log.Fields{"for device-id": oFsm.deviceID})
 	case <-time.After(30 * time.Second): //3s was detected to be to less in 8*8 bbsim test with debug Info/Debug
 		logger.Warnw(ctx, "UniPonAniConfigFsm multi entity timeout", log.Fields{"for device-id": oFsm.deviceID})
+		oFsm.mutexIsAwaitingResponse.Lock()
+		oFsm.isAwaitingResponse = false
+		oFsm.mutexIsAwaitingResponse.Unlock()
 		return fmt.Errorf("uniPonAniConfigFsm multi entity timeout %s", oFsm.deviceID)
 	case success := <-oFsm.omciMIdsResponseReceived:
 		if success {
 			logger.Debug(ctx, "uniPonAniConfigFsm multi entity response received")
+			oFsm.mutexIsAwaitingResponse.Lock()
+			oFsm.isAwaitingResponse = false
+			oFsm.mutexIsAwaitingResponse.Unlock()
 			return nil
 		}
-		// should not happen so far
-		logger.Warnw(ctx, "uniPonAniConfigFsm multi entity response error", log.Fields{"for device-id": oFsm.deviceID})
-		return fmt.Errorf("uniPonAniConfigFsm multi entity responseError %s", oFsm.deviceID)
+		// waiting was aborted (probably on external request)
+		logger.Debugw(ctx, "uniPonAniConfigFsm wait for multi entity response aborted", log.Fields{"for device-id": oFsm.deviceID})
+		oFsm.mutexIsAwaitingResponse.Lock()
+		oFsm.isAwaitingResponse = false
+		oFsm.mutexIsAwaitingResponse.Unlock()
+		return fmt.Errorf(cErrWaitAborted)
 	}
 }
diff --git a/internal/pkg/onuadaptercore/omci_vlan_config.go b/internal/pkg/onuadaptercore/omci_vlan_config.go
index 912af08..39a7bee 100644
--- a/internal/pkg/onuadaptercore/omci_vlan_config.go
+++ b/internal/pkg/onuadaptercore/omci_vlan_config.go
@@ -157,6 +157,8 @@
 	pAdaptFsm                   *AdapterFsm
 	acceptIncrementalEvtoOption bool
 	clearPersistency            bool
+	isAwaitingResponse          bool
+	mutexIsAwaitingResponse     sync.RWMutex
 	mutexFlowParams             sync.RWMutex
 	chCookieDeleted             chan bool //channel to indicate that a specificly indicated cookie was deleted
 	actualUniVlanConfigRule     uniVlanRuleParams
@@ -327,6 +329,27 @@
 	return nil
 }
 
+//CancelProcessing ensures that suspended processing at waiting on some response is aborted and reset of FSM
+func (oFsm *UniVlanConfigFsm) CancelProcessing() {
+	//mutex protection is required for possible concurrent access to FSM members
+	oFsm.mutexIsAwaitingResponse.RLock()
+	defer oFsm.mutexIsAwaitingResponse.RUnlock()
+	if oFsm.isAwaitingResponse {
+		//use channel to indicate that the response waiting shall be aborted
+		oFsm.omciMIdsResponseReceived <- false
+	}
+	// in any case (even if it might be automatically requested by above cancellation of waiting) ensure resetting the FSM
+	pAdaptFsm := oFsm.pAdaptFsm
+	if pAdaptFsm != nil {
+		// obviously calling some FSM event here directly does not work - so trying to decouple it ...
+		go func(aPAFsm *AdapterFsm) {
+			if aPAFsm.pFsm != nil {
+				_ = oFsm.pAdaptFsm.pFsm.Event(vlanEvReset)
+			}
+		}(pAdaptFsm)
+	}
+}
+
 //GetWaitingTpID returns the TpId that the FSM might be waiting for continuation (0 if none)
 func (oFsm *UniVlanConfigFsm) GetWaitingTpID() uint8 {
 	//mutex protection is required for possible concurrent access to FSM members
@@ -2095,21 +2118,33 @@
 }
 
 func (oFsm *UniVlanConfigFsm) waitforOmciResponse(ctx context.Context) error {
+	oFsm.mutexIsAwaitingResponse.Lock()
+	oFsm.isAwaitingResponse = true
+	oFsm.mutexIsAwaitingResponse.Unlock()
 	select {
 	// maybe be also some outside cancel (but no context modeled for the moment ...)
 	// case <-ctx.Done():
 	// 		logger.Infow(ctx,"LockState-bridge-init message reception canceled", log.Fields{"for device-id": oFsm.deviceID})
 	case <-time.After(30 * time.Second): //AS FOR THE OTHER OMCI FSM's
 		logger.Warnw(ctx, "UniVlanConfigFsm multi entity timeout", log.Fields{"for device-id": oFsm.deviceID})
+		oFsm.mutexIsAwaitingResponse.Lock()
+		oFsm.isAwaitingResponse = false
+		oFsm.mutexIsAwaitingResponse.Unlock()
 		return fmt.Errorf("uniVlanConfigFsm multi entity timeout %s", oFsm.deviceID)
 	case success := <-oFsm.omciMIdsResponseReceived:
 		if success {
 			logger.Debug(ctx, "UniVlanConfigFsm multi entity response received")
+			oFsm.mutexIsAwaitingResponse.Lock()
+			oFsm.isAwaitingResponse = false
+			oFsm.mutexIsAwaitingResponse.Unlock()
 			return nil
 		}
-		// should not happen so far
-		logger.Warnw(ctx, "UniVlanConfigFsm multi entity response error", log.Fields{"for device-id": oFsm.deviceID})
-		return fmt.Errorf("uniVlanConfigFsm multi entity responseError %s", oFsm.deviceID)
+		// waiting was aborted (probably on external request)
+		logger.Debugw(ctx, "UniVlanConfigFsm wait for multi entity response aborted", log.Fields{"for device-id": oFsm.deviceID})
+		oFsm.mutexIsAwaitingResponse.Lock()
+		oFsm.isAwaitingResponse = false
+		oFsm.mutexIsAwaitingResponse.Unlock()
+		return fmt.Errorf(cErrWaitAborted)
 	}
 }
 
diff --git a/internal/pkg/onuadaptercore/onu_device_entry.go b/internal/pkg/onuadaptercore/onu_device_entry.go
index 86c34cf..db33270 100644
--- a/internal/pkg/onuadaptercore/onu_device_entry.go
+++ b/internal/pkg/onuadaptercore/onu_device_entry.go
@@ -160,6 +160,11 @@
 	// Add other events here as needed (alarms separate???)
 )
 
+//AdapterFsm related error string
+//error string could be checked on waitforOmciResponse() e.g. to avoid misleading error log
+// but not used that way so far (permit error log even for wanted cancellation)
+const cErrWaitAborted = "waitResponse aborted"
+
 type activityDescr struct {
 	databaseClass func(context.Context) error
 	//advertiseEvents bool
diff --git a/internal/pkg/onuadaptercore/onu_uni_tp.go b/internal/pkg/onuadaptercore/onu_uni_tp.go
index 3fdbe19..ac59445 100644
--- a/internal/pkg/onuadaptercore/onu_uni_tp.go
+++ b/internal/pkg/onuadaptercore/onu_uni_tp.go
@@ -552,6 +552,7 @@
 		"device-id": onuTP.deviceID, "uni-id": aUniID, "path": aPathString, "Resource": aResource})
 	uniTPKey := uniTP{uniID: aUniID, tpID: aTpID}
 
+	bDeviceProcStatusUpdate := true
 	if cResourceGemPort == aResource {
 		logger.Debugw(ctx, "remove GemPort from the list of existing ones of the TP", log.Fields{
 			"device-id": onuTP.deviceID, "uni-id": aUniID, "path": aPathString, "GemPort": aEntryID})
@@ -640,6 +641,12 @@
 				return
 			}
 		} else {
+			//if we can't do the OMCI processing we also suppress the ProcStatusUpdate
+			//this is needed as in the device-down case where all FSM's are getting reset and internal data gets cleared
+			//as a consequence a possible remove-flow does not see any dependency on the TechProfile anymore and is executed (pro forma) directly
+			//a later TechProfile removal would cause the device-reason to be updated to 'techProfile-delete-success' which is not the expected state
+			// and anyway is no real useful information at that stage
+			bDeviceProcStatusUpdate = false
 			logger.Debugw(ctx, "uniPonAniConfigFsm delete Gem on OMCI skipped based on device state", log.Fields{
 				"device-id": onuTP.deviceID, "device-state": onuTP.baseDeviceHandler.getDeviceReasonString()})
 		}
@@ -720,6 +727,8 @@
 				return
 			}
 		} else {
+			//see gemPort comments
+			bDeviceProcStatusUpdate = false
 			logger.Debugw(ctx, "uniPonAniConfigFsm TCont cleanup on OMCI skipped based on device state", log.Fields{
 				"device-id": onuTP.deviceID, "device-state": onuTP.baseDeviceHandler.getDeviceReasonString()})
 		}
@@ -729,8 +738,10 @@
 		// FSM stop maybe encapsulated as OnuTP method - perhaps later in context of module splitting
 		_ = onuTP.pAniConfigFsm[uniTPKey].pAdaptFsm.pFsm.Event(aniEvReset)
 	}
-	// generate deviceHandler StatusEvent in case the FSM was not invoked
-	go onuTP.baseDeviceHandler.deviceProcStatusUpdate(ctx, OmciAniResourceRemoved)
+	if bDeviceProcStatusUpdate {
+		// generate deviceHandler StatusEvent in case the FSM was not invoked and OMCI processing not locked due to device state
+		go onuTP.baseDeviceHandler.deviceProcStatusUpdate(ctx, OmciAniResourceRemoved)
+	}
 }
 
 func (onuTP *onuUniTechProf) waitForTimeoutOrCompletion(