[VOL-4307] OpenOnuAdapter OnuImage download to adapter: wrong error indication and possibly adapter crash

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: Iffb22de92513d1b24f3409d6fc6a631a6bfb98e1
diff --git a/internal/pkg/onuadaptercore/omci_onu_upgrade.go b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
index b4c4d35..e687f48 100644
--- a/internal/pkg/onuadaptercore/omci_onu_upgrade.go
+++ b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
@@ -135,9 +135,7 @@
 	mutexIsAwaitingAdapterDlResponse sync.RWMutex
 	chAdapterDlReady                 chan bool
 	isWaitingForAdapterDlResponse    bool
-	mutexIsAwaitingOnuDlResponse     sync.RWMutex
 	chOnuDlReady                     chan bool
-	isWaitingForOnuDlResponse        bool
 	activateImage                    bool
 	commitImage                      bool
 	mutexAbortRequest                sync.RWMutex
@@ -596,7 +594,7 @@
 	}
 	if err != nil || fileLen > int64(cMaxUint32) {
 		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
 		oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN
 		oFsm.mutexUpgradeParams.Unlock()
 		logger.Errorw(ctx, "OnuUpgradeFsm abort: problems getting image buffer length", log.Fields{
@@ -618,7 +616,7 @@
 	}
 	if err != nil {
 		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
 		oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN
 		oFsm.mutexUpgradeParams.Unlock()
 		logger.Errorw(ctx, "OnuUpgradeFsm abort: can't get image buffer", log.Fields{
@@ -646,6 +644,13 @@
 	//"NumberOfCircuitPacks": oFsm.numberCircuitPacks, "CircuitPacks MeId": 0}) //parallel circuit packs download not supported
 
 	oFsm.mutexUpgradeParams.Unlock()
+
+	// flush commMetricsChan
+	select {
+	case <-oFsm.chOnuDlReady:
+		logger.Debug(ctx, "flushed OnuDlReady channel")
+	default:
+	}
 	go oFsm.waitOnDownloadToOnuReady(ctx, oFsm.chOnuDlReady) // start supervision of the complete download-to-ONU procedure
 
 	err = oFsm.pOmciCC.sendStartSoftwareDownload(log.WithSpanFromContext(context.TODO(), ctx), oFsm.pDeviceHandler.pOpenOnuAc.omciTimeout, false,
@@ -706,7 +711,7 @@
 				"device-id": oFsm.deviceID, "bufferStartOffset": bufferStartOffset,
 				"bufferEndOffset": bufferEndOffset, "imageLength": oFsm.imageLength})
 			oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-			oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+			oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
 			oFsm.mutexUpgradeParams.Unlock()
 			//logical error -- reset the FSM
 			pBaseFsm := oFsm.pAdaptFsm
@@ -821,7 +826,7 @@
 		}
 		oFsm.mutexUpgradeParams.Lock()
 		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-		oFsm.volthaDownloadReason = voltha.ImageState_IMAGE_REFUSED_BY_ONU //Something like 'END_DOWNLOAD_TIMEOUT' would be better (proto)
+		oFsm.volthaDownloadReason = voltha.ImageState_IMAGE_REFUSED_BY_ONU //something like 'END_DOWNLOAD_TIMEOUT' would be better (proto)
 		oFsm.mutexUpgradeParams.Unlock()
 		go func(a_pAFsm *AdapterFsm) {
 			_ = a_pAFsm.pFsm.Event(upgradeEvAbort)
@@ -989,13 +994,11 @@
 	}
 
 	// in case the download-to-ONU timer is still running - cancel it
-	oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-	if oFsm.isWaitingForOnuDlResponse {
-		oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-		//use channel to indicate that the download response waiting shall be aborted for this device (channel)
-		oFsm.chOnuDlReady <- false
-	} else {
-		oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+	//use non-blocking channel (to be independent from receiver state)
+	select {
+	//use channel to indicate that the download response waiting shall be aborted for this device (channel)
+	case oFsm.chOnuDlReady <- false:
+	default:
 	}
 
 	pConfigupgradeStateAFsm := oFsm.pAdaptFsm
@@ -1239,16 +1242,11 @@
 					//  - assume new loaded image as valid-inactive immediately
 					oFsm.volthaImageState = voltha.ImageState_IMAGE_INACTIVE
 					oFsm.mutexUpgradeParams.Unlock()
-					oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-					if oFsm.isWaitingForOnuDlResponse {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-						//use non-blocking channel to indicate that the download to ONU was successful
-						select {
-						case oFsm.chOnuDlReady <- true:
-						default:
-						}
-					} else {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+					//use non-blocking channel (to be independent from receiver state)
+					select {
+					//use non-blocking channel to indicate that the download to ONU was successful
+					case oFsm.chOnuDlReady <- true:
+					default:
 					}
 				} else {
 					oFsm.mutexUpgradeParams.Unlock()
@@ -1401,13 +1399,11 @@
 						oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN              //something like 'DOWNLOADED' would be better - proto def
 						oFsm.mutexUpgradeParams.Unlock()
 						//stop the running ONU download timer
-						oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-						if oFsm.isWaitingForOnuDlResponse {
-							oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-							//use channel to indicate that the download to ONU was not successful
-							oFsm.chOnuDlReady <- false
-						} else {
-							oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+						//use non-blocking channel (to be independent from receiver state)
+						select {
+						//use channel to indicate that the download response waiting shall be aborted for this device (channel)
+						case oFsm.chOnuDlReady <- false:
+						default:
 						}
 						// TODO!!!: error treatment?
 						//TODO!!!: possibly send event information for aborted upgrade (aborted by wrong version)?
@@ -1434,13 +1430,11 @@
 							log.Fields{"device-id": oFsm.deviceID})
 						_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvWaitForActivate)
 					}
-					oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-					if oFsm.isWaitingForOnuDlResponse {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-						//use channel to indicate that the download to ONU was successful
-						oFsm.chOnuDlReady <- true
-					} else {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+					//use non-blocking channel (to be independent from receiver state)
+					select {
+					//use non-blocking channel to indicate that the download to ONU was successful
+					case oFsm.chOnuDlReady <- true:
+					default:
 					}
 					return
 				}
@@ -1569,10 +1563,25 @@
 		logger.Warnw(ctx, "OnuUpgradeFsm Waiting-adapter-download timeout", log.Fields{
 			"for device-id": oFsm.deviceID, "image-id": oFsm.imageIdentifier, "timeout": downloadToAdapterTimeout})
 		oFsm.pFileManager.RemoveReadyRequest(ctx, oFsm.imageIdentifier, aWaitChannel)
+		//running into timeout here may still have the download to adapter active -> abort
+		oFsm.pFileManager.CancelDownload(ctx, oFsm.imageIdentifier)
 		oFsm.mutexIsAwaitingAdapterDlResponse.Lock()
 		oFsm.isWaitingForAdapterDlResponse = false
 		oFsm.mutexIsAwaitingAdapterDlResponse.Unlock()
-		oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
+		oFsm.mutexUpgradeParams.Lock()
+		oFsm.conditionalCancelRequested = false //any conditional cancelRequest is superseded by this abortion
+		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
+		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'DOWNLOAD_TO_ADAPTER_TIMEOUT' would be better (proto)
+		oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN     //something like 'IMAGE_DOWNLOAD_ABORTED' would be better (proto)
+		oFsm.mutexUpgradeParams.Unlock()
+		//TODO!!!: possibly send event information for aborted upgrade (aborted by omci processing)??
+		if oFsm.pAdaptFsm != nil && oFsm.pAdaptFsm.pFsm != nil {
+			err := oFsm.pAdaptFsm.pFsm.Event(upgradeEvAbort)
+			if err != nil {
+				logger.Warnw(ctx, "onu upgrade fsm could not abort on omci error", log.Fields{
+					"device-id": oFsm.deviceID, "error": err})
+			}
+		}
 		return
 
 	case success := <-aWaitChannel:
@@ -1590,14 +1599,13 @@
 			}
 			return
 		}
-		// waiting was aborted (probably on external request)
+		// waiting was aborted (assumed here to be caused by
+		//   error detection or cancel at download after upgrade FSM reset/abort with according image states set there)
 		logger.Debugw(ctx, "OnuUpgradeFsm Waiting-adapter-download aborted", log.Fields{"device-id": oFsm.deviceID})
 		oFsm.pFileManager.RemoveReadyRequest(ctx, oFsm.imageIdentifier, aWaitChannel)
 		oFsm.mutexIsAwaitingAdapterDlResponse.Lock()
 		oFsm.isWaitingForAdapterDlResponse = false
 		oFsm.mutexIsAwaitingAdapterDlResponse.Unlock()
-		//the upgrade process has to be aborted
-		oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
 		return
 	}
 }
@@ -1607,9 +1615,6 @@
 	downloadToOnuTimeout := time.Duration(1+(oFsm.imageLength/0x400000)) * oFsm.downloadToOnuTimeout4MB
 	logger.Debugw(ctx, "OnuUpgradeFsm start download-to-ONU timer", log.Fields{"device-id": oFsm.deviceID,
 		"duration": downloadToOnuTimeout})
-	oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-	oFsm.isWaitingForOnuDlResponse = true
-	oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 	select {
 	// maybe be also some outside cancel (but no context modeled for the moment ...)
 	// case <-ctx.Done():
@@ -1617,28 +1622,19 @@
 	case <-time.After(downloadToOnuTimeout): //using an image-size depending timout (in minutes)
 		logger.Warnw(ctx, "OnuUpgradeFsm Waiting-ONU-download timeout", log.Fields{
 			"for device-id": oFsm.deviceID, "image-id": oFsm.imageIdentifier, "timeout": downloadToOnuTimeout})
-		oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-		oFsm.isWaitingForOnuDlResponse = false
-		oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 		//the upgrade process has to be aborted
-		oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
+		oFsm.abortOnOmciError(ctx, false, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
 		return
 
 	case success := <-aWaitChannel:
 		if success {
 			logger.Debugw(ctx, "OnuUpgradeFsm image-downloaded on ONU received", log.Fields{"device-id": oFsm.deviceID})
-			oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-			oFsm.isWaitingForOnuDlResponse = false
-			oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 			//all fine, let the FSM proceed like defined from the sender of this event
 			return
 		}
 		// waiting was aborted (assumed here to be caused by
-		//   error detection or cancel at download after upgrade FSM reset/abort)
+		//   error detection or cancel at download after upgrade FSM reset/abort with according image states set there)
 		logger.Debugw(ctx, "OnuUpgradeFsm Waiting-ONU-download aborted", log.Fields{"device-id": oFsm.deviceID})
-		oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-		oFsm.isWaitingForOnuDlResponse = false
-		oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 		return
 	}
 }