[VOL-4307] OpenOnuAdapter OnuImage download to adapter: wrong error indication and possibly adapter crash

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: Iffb22de92513d1b24f3409d6fc6a631a6bfb98e1
diff --git a/VERSION b/VERSION
index 1957c93..8e30f8d 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.4.1-dev227
+1.4.1-dev228
diff --git a/internal/pkg/onuadaptercore/file_download_manager.go b/internal/pkg/onuadaptercore/file_download_manager.go
index 00440c1..4793e69 100644
--- a/internal/pkg/onuadaptercore/file_download_manager.go
+++ b/internal/pkg/onuadaptercore/file_download_manager.go
@@ -47,10 +47,12 @@
 )
 
 type downloadImageParams struct {
-	downloadImageName  string
-	downloadImageState fileState
-	downloadImageLen   int64
-	downloadImageCrc   uint32
+	downloadImageName       string
+	downloadImageState      fileState
+	downloadImageLen        int64
+	downloadImageCrc        uint32
+	downloadActive          bool
+	downloadContextCancelFn context.CancelFunc
 }
 
 type requesterChannelMap map[chan<- bool]struct{} //using an empty structure map for easier (unique) element appending
@@ -239,6 +241,23 @@
 	return false
 }
 
+//updateDownloadCancel sets context cancel function to be used in case the download is to be aborted
+func (dm *fileDownloadManager) updateDownloadCancel(ctx context.Context,
+	aImageName string, aCancelFn context.CancelFunc) {
+	dm.mutexDownloadImageDsc.Lock()
+	defer dm.mutexDownloadImageDsc.Unlock()
+	for imgKey, dnldImgDsc := range dm.downloadImageDscSlice {
+		if dnldImgDsc.downloadImageName == aImageName {
+			//image found (by name) - need to write changes on the original map
+			dm.downloadImageDscSlice[imgKey].downloadContextCancelFn = aCancelFn
+			dm.downloadImageDscSlice[imgKey].downloadActive = true
+			logger.Debugw(ctx, "downloadContextCancelFn set", log.Fields{
+				"image-name": aImageName})
+			return //can leave directly
+		}
+	}
+}
+
 //updateFileState sets the new active (downloaded) file state and informs possibly waiting requesters on this change
 func (dm *fileDownloadManager) updateFileState(ctx context.Context, aImageName string, aFileSize int64) {
 	dm.mutexDownloadImageDsc.Lock()
@@ -246,6 +265,7 @@
 	for imgKey, dnldImgDsc := range dm.downloadImageDscSlice {
 		if dnldImgDsc.downloadImageName == aImageName {
 			//image found (by name) - need to write changes on the original map
+			dm.downloadImageDscSlice[imgKey].downloadActive = false
 			dm.downloadImageDscSlice[imgKey].downloadImageState = cFileStateDlSucceeded
 			dm.downloadImageDscSlice[imgKey].downloadImageLen = aFileSize
 			logger.Debugw(ctx, "imageState download succeeded", log.Fields{
@@ -290,6 +310,12 @@
 	_ = reqExist.WithContext(ctxExist)
 	respExist, errExist3 := http.DefaultClient.Do(reqExist)
 	if errExist3 != nil || respExist.StatusCode != http.StatusOK {
+		if respExist == nil {
+			logger.Errorw(ctx, "http head from url error - no status, aborting", log.Fields{"url": urlBase.String(),
+				"error": errExist3})
+			return fmt.Errorf("http head from url error - no status, aborting: %s, error: %s",
+				aURLCommand, errExist3)
+		}
 		logger.Infow(ctx, "could not http head from url", log.Fields{"url": urlBase.String(),
 			"error": errExist3, "status": respExist.StatusCode})
 		//if head is not supported by server we cannot use this test and just try to continue
@@ -297,7 +323,7 @@
 			logger.Errorw(ctx, "http head from url: file does not exist here, aborting", log.Fields{"url": urlBase.String(),
 				"error": errExist3, "status": respExist.StatusCode})
 			return fmt.Errorf("http head from url: file does not exist here, aborting: %s, error: %s, status: %d",
-				aURLCommand, errExist2, respExist.StatusCode)
+				aURLCommand, errExist3, respExist.StatusCode)
 		}
 	}
 	defer func() {
@@ -316,12 +342,18 @@
 			return
 		}
 		ctx, cancel := context.WithDeadline(ctx, time.Now().Add(dm.dlToAdapterTimeout)) //timeout as given from SetDownloadTimeout()
+		dm.updateDownloadCancel(ctx, aFileName, cancel)
 		defer cancel()
 		_ = req.WithContext(ctx)
 		resp, err3 := http.DefaultClient.Do(req)
-		if err3 != nil || respExist.StatusCode != http.StatusOK {
-			logger.Errorw(ctx, "could not http get from url", log.Fields{"url": urlBase.String(),
-				"error": err3, "status": respExist.StatusCode})
+		if err3 != nil || resp.StatusCode != http.StatusOK {
+			if resp == nil {
+				logger.Errorw(ctx, "http get error - no status, aborting", log.Fields{"url": urlBase.String(),
+					"error": err3})
+			} else {
+				logger.Errorw(ctx, "could not http get from url", log.Fields{"url": urlBase.String(),
+					"error": err3, "status": resp.StatusCode})
+			}
 			dm.removeImage(ctx, aFileName, false) //wo FileSystem access
 			return
 		}
@@ -398,9 +430,23 @@
 	//image not found (by name)
 }
 
-//CancelDownload just wraps removeImage (for possible extensions to e.g. really stop a ongoing download)
+//CancelDownload stops the download and clears all entires concerning this aimageName
 func (dm *fileDownloadManager) CancelDownload(ctx context.Context, aImageName string) {
 	// for the moment that would only support to wait for the download end and remove the image then
 	//   further reactions while still downloading can be considered with some effort, but does it make sense (synchronous load here!)
-	dm.removeImage(ctx, aImageName, true)
+	dm.mutexDownloadImageDsc.RLock()
+	for imgKey, dnldImgDsc := range dm.downloadImageDscSlice {
+		if dnldImgDsc.downloadImageName == aImageName {
+			//image found (by name) - need to to check on ongoing download
+			if dm.downloadImageDscSlice[imgKey].downloadActive {
+				//then cancel the download using the context cancel function
+				dm.downloadImageDscSlice[imgKey].downloadContextCancelFn()
+			}
+			//and remove possibly stored traces of this image
+			dm.mutexDownloadImageDsc.RUnlock()
+			go dm.removeImage(ctx, aImageName, true) //including the chance that nothing was yet written to FS, should not matter
+			return                                   //can leave directly
+		}
+	}
+	dm.mutexDownloadImageDsc.RUnlock()
 }
diff --git a/internal/pkg/onuadaptercore/omci_onu_upgrade.go b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
index b4c4d35..e687f48 100644
--- a/internal/pkg/onuadaptercore/omci_onu_upgrade.go
+++ b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
@@ -135,9 +135,7 @@
 	mutexIsAwaitingAdapterDlResponse sync.RWMutex
 	chAdapterDlReady                 chan bool
 	isWaitingForAdapterDlResponse    bool
-	mutexIsAwaitingOnuDlResponse     sync.RWMutex
 	chOnuDlReady                     chan bool
-	isWaitingForOnuDlResponse        bool
 	activateImage                    bool
 	commitImage                      bool
 	mutexAbortRequest                sync.RWMutex
@@ -596,7 +594,7 @@
 	}
 	if err != nil || fileLen > int64(cMaxUint32) {
 		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
 		oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN
 		oFsm.mutexUpgradeParams.Unlock()
 		logger.Errorw(ctx, "OnuUpgradeFsm abort: problems getting image buffer length", log.Fields{
@@ -618,7 +616,7 @@
 	}
 	if err != nil {
 		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
 		oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN
 		oFsm.mutexUpgradeParams.Unlock()
 		logger.Errorw(ctx, "OnuUpgradeFsm abort: can't get image buffer", log.Fields{
@@ -646,6 +644,13 @@
 	//"NumberOfCircuitPacks": oFsm.numberCircuitPacks, "CircuitPacks MeId": 0}) //parallel circuit packs download not supported
 
 	oFsm.mutexUpgradeParams.Unlock()
+
+	// flush commMetricsChan
+	select {
+	case <-oFsm.chOnuDlReady:
+		logger.Debug(ctx, "flushed OnuDlReady channel")
+	default:
+	}
 	go oFsm.waitOnDownloadToOnuReady(ctx, oFsm.chOnuDlReady) // start supervision of the complete download-to-ONU procedure
 
 	err = oFsm.pOmciCC.sendStartSoftwareDownload(log.WithSpanFromContext(context.TODO(), ctx), oFsm.pDeviceHandler.pOpenOnuAc.omciTimeout, false,
@@ -706,7 +711,7 @@
 				"device-id": oFsm.deviceID, "bufferStartOffset": bufferStartOffset,
 				"bufferEndOffset": bufferEndOffset, "imageLength": oFsm.imageLength})
 			oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-			oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+			oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
 			oFsm.mutexUpgradeParams.Unlock()
 			//logical error -- reset the FSM
 			pBaseFsm := oFsm.pAdaptFsm
@@ -821,7 +826,7 @@
 		}
 		oFsm.mutexUpgradeParams.Lock()
 		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
-		oFsm.volthaDownloadReason = voltha.ImageState_IMAGE_REFUSED_BY_ONU //Something like 'END_DOWNLOAD_TIMEOUT' would be better (proto)
+		oFsm.volthaDownloadReason = voltha.ImageState_IMAGE_REFUSED_BY_ONU //something like 'END_DOWNLOAD_TIMEOUT' would be better (proto)
 		oFsm.mutexUpgradeParams.Unlock()
 		go func(a_pAFsm *AdapterFsm) {
 			_ = a_pAFsm.pFsm.Event(upgradeEvAbort)
@@ -989,13 +994,11 @@
 	}
 
 	// in case the download-to-ONU timer is still running - cancel it
-	oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-	if oFsm.isWaitingForOnuDlResponse {
-		oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-		//use channel to indicate that the download response waiting shall be aborted for this device (channel)
-		oFsm.chOnuDlReady <- false
-	} else {
-		oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+	//use non-blocking channel (to be independent from receiver state)
+	select {
+	//use channel to indicate that the download response waiting shall be aborted for this device (channel)
+	case oFsm.chOnuDlReady <- false:
+	default:
 	}
 
 	pConfigupgradeStateAFsm := oFsm.pAdaptFsm
@@ -1239,16 +1242,11 @@
 					//  - assume new loaded image as valid-inactive immediately
 					oFsm.volthaImageState = voltha.ImageState_IMAGE_INACTIVE
 					oFsm.mutexUpgradeParams.Unlock()
-					oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-					if oFsm.isWaitingForOnuDlResponse {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-						//use non-blocking channel to indicate that the download to ONU was successful
-						select {
-						case oFsm.chOnuDlReady <- true:
-						default:
-						}
-					} else {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+					//use non-blocking channel (to be independent from receiver state)
+					select {
+					//use non-blocking channel to indicate that the download to ONU was successful
+					case oFsm.chOnuDlReady <- true:
+					default:
 					}
 				} else {
 					oFsm.mutexUpgradeParams.Unlock()
@@ -1401,13 +1399,11 @@
 						oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN              //something like 'DOWNLOADED' would be better - proto def
 						oFsm.mutexUpgradeParams.Unlock()
 						//stop the running ONU download timer
-						oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-						if oFsm.isWaitingForOnuDlResponse {
-							oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-							//use channel to indicate that the download to ONU was not successful
-							oFsm.chOnuDlReady <- false
-						} else {
-							oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+						//use non-blocking channel (to be independent from receiver state)
+						select {
+						//use channel to indicate that the download response waiting shall be aborted for this device (channel)
+						case oFsm.chOnuDlReady <- false:
+						default:
 						}
 						// TODO!!!: error treatment?
 						//TODO!!!: possibly send event information for aborted upgrade (aborted by wrong version)?
@@ -1434,13 +1430,11 @@
 							log.Fields{"device-id": oFsm.deviceID})
 						_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvWaitForActivate)
 					}
-					oFsm.mutexIsAwaitingOnuDlResponse.RLock()
-					if oFsm.isWaitingForOnuDlResponse {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
-						//use channel to indicate that the download to ONU was successful
-						oFsm.chOnuDlReady <- true
-					} else {
-						oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+					//use non-blocking channel (to be independent from receiver state)
+					select {
+					//use non-blocking channel to indicate that the download to ONU was successful
+					case oFsm.chOnuDlReady <- true:
+					default:
 					}
 					return
 				}
@@ -1569,10 +1563,25 @@
 		logger.Warnw(ctx, "OnuUpgradeFsm Waiting-adapter-download timeout", log.Fields{
 			"for device-id": oFsm.deviceID, "image-id": oFsm.imageIdentifier, "timeout": downloadToAdapterTimeout})
 		oFsm.pFileManager.RemoveReadyRequest(ctx, oFsm.imageIdentifier, aWaitChannel)
+		//running into timeout here may still have the download to adapter active -> abort
+		oFsm.pFileManager.CancelDownload(ctx, oFsm.imageIdentifier)
 		oFsm.mutexIsAwaitingAdapterDlResponse.Lock()
 		oFsm.isWaitingForAdapterDlResponse = false
 		oFsm.mutexIsAwaitingAdapterDlResponse.Unlock()
-		oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
+		oFsm.mutexUpgradeParams.Lock()
+		oFsm.conditionalCancelRequested = false //any conditional cancelRequest is superseded by this abortion
+		oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
+		oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'DOWNLOAD_TO_ADAPTER_TIMEOUT' would be better (proto)
+		oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN     //something like 'IMAGE_DOWNLOAD_ABORTED' would be better (proto)
+		oFsm.mutexUpgradeParams.Unlock()
+		//TODO!!!: possibly send event information for aborted upgrade (aborted by omci processing)??
+		if oFsm.pAdaptFsm != nil && oFsm.pAdaptFsm.pFsm != nil {
+			err := oFsm.pAdaptFsm.pFsm.Event(upgradeEvAbort)
+			if err != nil {
+				logger.Warnw(ctx, "onu upgrade fsm could not abort on omci error", log.Fields{
+					"device-id": oFsm.deviceID, "error": err})
+			}
+		}
 		return
 
 	case success := <-aWaitChannel:
@@ -1590,14 +1599,13 @@
 			}
 			return
 		}
-		// waiting was aborted (probably on external request)
+		// waiting was aborted (assumed here to be caused by
+		//   error detection or cancel at download after upgrade FSM reset/abort with according image states set there)
 		logger.Debugw(ctx, "OnuUpgradeFsm Waiting-adapter-download aborted", log.Fields{"device-id": oFsm.deviceID})
 		oFsm.pFileManager.RemoveReadyRequest(ctx, oFsm.imageIdentifier, aWaitChannel)
 		oFsm.mutexIsAwaitingAdapterDlResponse.Lock()
 		oFsm.isWaitingForAdapterDlResponse = false
 		oFsm.mutexIsAwaitingAdapterDlResponse.Unlock()
-		//the upgrade process has to be aborted
-		oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
 		return
 	}
 }
@@ -1607,9 +1615,6 @@
 	downloadToOnuTimeout := time.Duration(1+(oFsm.imageLength/0x400000)) * oFsm.downloadToOnuTimeout4MB
 	logger.Debugw(ctx, "OnuUpgradeFsm start download-to-ONU timer", log.Fields{"device-id": oFsm.deviceID,
 		"duration": downloadToOnuTimeout})
-	oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-	oFsm.isWaitingForOnuDlResponse = true
-	oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 	select {
 	// maybe be also some outside cancel (but no context modeled for the moment ...)
 	// case <-ctx.Done():
@@ -1617,28 +1622,19 @@
 	case <-time.After(downloadToOnuTimeout): //using an image-size depending timout (in minutes)
 		logger.Warnw(ctx, "OnuUpgradeFsm Waiting-ONU-download timeout", log.Fields{
 			"for device-id": oFsm.deviceID, "image-id": oFsm.imageIdentifier, "timeout": downloadToOnuTimeout})
-		oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-		oFsm.isWaitingForOnuDlResponse = false
-		oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 		//the upgrade process has to be aborted
-		oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
+		oFsm.abortOnOmciError(ctx, false, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
 		return
 
 	case success := <-aWaitChannel:
 		if success {
 			logger.Debugw(ctx, "OnuUpgradeFsm image-downloaded on ONU received", log.Fields{"device-id": oFsm.deviceID})
-			oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-			oFsm.isWaitingForOnuDlResponse = false
-			oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 			//all fine, let the FSM proceed like defined from the sender of this event
 			return
 		}
 		// waiting was aborted (assumed here to be caused by
-		//   error detection or cancel at download after upgrade FSM reset/abort)
+		//   error detection or cancel at download after upgrade FSM reset/abort with according image states set there)
 		logger.Debugw(ctx, "OnuUpgradeFsm Waiting-ONU-download aborted", log.Fields{"device-id": oFsm.deviceID})
-		oFsm.mutexIsAwaitingOnuDlResponse.Lock()
-		oFsm.isWaitingForOnuDlResponse = false
-		oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
 		return
 	}
 }