[VOL-4307] OpenOnuAdapter OnuImage download to adapter: wrong error indication and possibly adapter crash
Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: Iffb22de92513d1b24f3409d6fc6a631a6bfb98e1
diff --git a/VERSION b/VERSION
index 1957c93..8e30f8d 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.4.1-dev227
+1.4.1-dev228
diff --git a/internal/pkg/onuadaptercore/file_download_manager.go b/internal/pkg/onuadaptercore/file_download_manager.go
index 00440c1..4793e69 100644
--- a/internal/pkg/onuadaptercore/file_download_manager.go
+++ b/internal/pkg/onuadaptercore/file_download_manager.go
@@ -47,10 +47,12 @@
)
type downloadImageParams struct {
- downloadImageName string
- downloadImageState fileState
- downloadImageLen int64
- downloadImageCrc uint32
+ downloadImageName string
+ downloadImageState fileState
+ downloadImageLen int64
+ downloadImageCrc uint32
+ downloadActive bool
+ downloadContextCancelFn context.CancelFunc
}
type requesterChannelMap map[chan<- bool]struct{} //using an empty structure map for easier (unique) element appending
@@ -239,6 +241,23 @@
return false
}
+//updateDownloadCancel sets context cancel function to be used in case the download is to be aborted
+func (dm *fileDownloadManager) updateDownloadCancel(ctx context.Context,
+ aImageName string, aCancelFn context.CancelFunc) {
+ dm.mutexDownloadImageDsc.Lock()
+ defer dm.mutexDownloadImageDsc.Unlock()
+ for imgKey, dnldImgDsc := range dm.downloadImageDscSlice {
+ if dnldImgDsc.downloadImageName == aImageName {
+ //image found (by name) - need to write changes on the original map
+ dm.downloadImageDscSlice[imgKey].downloadContextCancelFn = aCancelFn
+ dm.downloadImageDscSlice[imgKey].downloadActive = true
+ logger.Debugw(ctx, "downloadContextCancelFn set", log.Fields{
+ "image-name": aImageName})
+ return //can leave directly
+ }
+ }
+}
+
//updateFileState sets the new active (downloaded) file state and informs possibly waiting requesters on this change
func (dm *fileDownloadManager) updateFileState(ctx context.Context, aImageName string, aFileSize int64) {
dm.mutexDownloadImageDsc.Lock()
@@ -246,6 +265,7 @@
for imgKey, dnldImgDsc := range dm.downloadImageDscSlice {
if dnldImgDsc.downloadImageName == aImageName {
//image found (by name) - need to write changes on the original map
+ dm.downloadImageDscSlice[imgKey].downloadActive = false
dm.downloadImageDscSlice[imgKey].downloadImageState = cFileStateDlSucceeded
dm.downloadImageDscSlice[imgKey].downloadImageLen = aFileSize
logger.Debugw(ctx, "imageState download succeeded", log.Fields{
@@ -290,6 +310,12 @@
_ = reqExist.WithContext(ctxExist)
respExist, errExist3 := http.DefaultClient.Do(reqExist)
if errExist3 != nil || respExist.StatusCode != http.StatusOK {
+ if respExist == nil {
+ logger.Errorw(ctx, "http head from url error - no status, aborting", log.Fields{"url": urlBase.String(),
+ "error": errExist3})
+ return fmt.Errorf("http head from url error - no status, aborting: %s, error: %s",
+ aURLCommand, errExist3)
+ }
logger.Infow(ctx, "could not http head from url", log.Fields{"url": urlBase.String(),
"error": errExist3, "status": respExist.StatusCode})
//if head is not supported by server we cannot use this test and just try to continue
@@ -297,7 +323,7 @@
logger.Errorw(ctx, "http head from url: file does not exist here, aborting", log.Fields{"url": urlBase.String(),
"error": errExist3, "status": respExist.StatusCode})
return fmt.Errorf("http head from url: file does not exist here, aborting: %s, error: %s, status: %d",
- aURLCommand, errExist2, respExist.StatusCode)
+ aURLCommand, errExist3, respExist.StatusCode)
}
}
defer func() {
@@ -316,12 +342,18 @@
return
}
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(dm.dlToAdapterTimeout)) //timeout as given from SetDownloadTimeout()
+ dm.updateDownloadCancel(ctx, aFileName, cancel)
defer cancel()
_ = req.WithContext(ctx)
resp, err3 := http.DefaultClient.Do(req)
- if err3 != nil || respExist.StatusCode != http.StatusOK {
- logger.Errorw(ctx, "could not http get from url", log.Fields{"url": urlBase.String(),
- "error": err3, "status": respExist.StatusCode})
+ if err3 != nil || resp.StatusCode != http.StatusOK {
+ if resp == nil {
+ logger.Errorw(ctx, "http get error - no status, aborting", log.Fields{"url": urlBase.String(),
+ "error": err3})
+ } else {
+ logger.Errorw(ctx, "could not http get from url", log.Fields{"url": urlBase.String(),
+ "error": err3, "status": resp.StatusCode})
+ }
dm.removeImage(ctx, aFileName, false) //wo FileSystem access
return
}
@@ -398,9 +430,23 @@
//image not found (by name)
}
-//CancelDownload just wraps removeImage (for possible extensions to e.g. really stop a ongoing download)
+//CancelDownload stops the download and clears all entires concerning this aimageName
func (dm *fileDownloadManager) CancelDownload(ctx context.Context, aImageName string) {
// for the moment that would only support to wait for the download end and remove the image then
// further reactions while still downloading can be considered with some effort, but does it make sense (synchronous load here!)
- dm.removeImage(ctx, aImageName, true)
+ dm.mutexDownloadImageDsc.RLock()
+ for imgKey, dnldImgDsc := range dm.downloadImageDscSlice {
+ if dnldImgDsc.downloadImageName == aImageName {
+ //image found (by name) - need to to check on ongoing download
+ if dm.downloadImageDscSlice[imgKey].downloadActive {
+ //then cancel the download using the context cancel function
+ dm.downloadImageDscSlice[imgKey].downloadContextCancelFn()
+ }
+ //and remove possibly stored traces of this image
+ dm.mutexDownloadImageDsc.RUnlock()
+ go dm.removeImage(ctx, aImageName, true) //including the chance that nothing was yet written to FS, should not matter
+ return //can leave directly
+ }
+ }
+ dm.mutexDownloadImageDsc.RUnlock()
}
diff --git a/internal/pkg/onuadaptercore/omci_onu_upgrade.go b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
index b4c4d35..e687f48 100644
--- a/internal/pkg/onuadaptercore/omci_onu_upgrade.go
+++ b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
@@ -135,9 +135,7 @@
mutexIsAwaitingAdapterDlResponse sync.RWMutex
chAdapterDlReady chan bool
isWaitingForAdapterDlResponse bool
- mutexIsAwaitingOnuDlResponse sync.RWMutex
chOnuDlReady chan bool
- isWaitingForOnuDlResponse bool
activateImage bool
commitImage bool
mutexAbortRequest sync.RWMutex
@@ -596,7 +594,7 @@
}
if err != nil || fileLen > int64(cMaxUint32) {
oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
- oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+ oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN
oFsm.mutexUpgradeParams.Unlock()
logger.Errorw(ctx, "OnuUpgradeFsm abort: problems getting image buffer length", log.Fields{
@@ -618,7 +616,7 @@
}
if err != nil {
oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
- oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+ oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN
oFsm.mutexUpgradeParams.Unlock()
logger.Errorw(ctx, "OnuUpgradeFsm abort: can't get image buffer", log.Fields{
@@ -646,6 +644,13 @@
//"NumberOfCircuitPacks": oFsm.numberCircuitPacks, "CircuitPacks MeId": 0}) //parallel circuit packs download not supported
oFsm.mutexUpgradeParams.Unlock()
+
+ // flush commMetricsChan
+ select {
+ case <-oFsm.chOnuDlReady:
+ logger.Debug(ctx, "flushed OnuDlReady channel")
+ default:
+ }
go oFsm.waitOnDownloadToOnuReady(ctx, oFsm.chOnuDlReady) // start supervision of the complete download-to-ONU procedure
err = oFsm.pOmciCC.sendStartSoftwareDownload(log.WithSpanFromContext(context.TODO(), ctx), oFsm.pDeviceHandler.pOpenOnuAc.omciTimeout, false,
@@ -706,7 +711,7 @@
"device-id": oFsm.deviceID, "bufferStartOffset": bufferStartOffset,
"bufferEndOffset": bufferEndOffset, "imageLength": oFsm.imageLength})
oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
- oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //Something like 'LOCAL_FILE_ERROR' would be better (proto)
+ oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'LOCAL_FILE_ERROR' would be better (proto)
oFsm.mutexUpgradeParams.Unlock()
//logical error -- reset the FSM
pBaseFsm := oFsm.pAdaptFsm
@@ -821,7 +826,7 @@
}
oFsm.mutexUpgradeParams.Lock()
oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
- oFsm.volthaDownloadReason = voltha.ImageState_IMAGE_REFUSED_BY_ONU //Something like 'END_DOWNLOAD_TIMEOUT' would be better (proto)
+ oFsm.volthaDownloadReason = voltha.ImageState_IMAGE_REFUSED_BY_ONU //something like 'END_DOWNLOAD_TIMEOUT' would be better (proto)
oFsm.mutexUpgradeParams.Unlock()
go func(a_pAFsm *AdapterFsm) {
_ = a_pAFsm.pFsm.Event(upgradeEvAbort)
@@ -989,13 +994,11 @@
}
// in case the download-to-ONU timer is still running - cancel it
- oFsm.mutexIsAwaitingOnuDlResponse.RLock()
- if oFsm.isWaitingForOnuDlResponse {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
- //use channel to indicate that the download response waiting shall be aborted for this device (channel)
- oFsm.chOnuDlReady <- false
- } else {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+ //use non-blocking channel (to be independent from receiver state)
+ select {
+ //use channel to indicate that the download response waiting shall be aborted for this device (channel)
+ case oFsm.chOnuDlReady <- false:
+ default:
}
pConfigupgradeStateAFsm := oFsm.pAdaptFsm
@@ -1239,16 +1242,11 @@
// - assume new loaded image as valid-inactive immediately
oFsm.volthaImageState = voltha.ImageState_IMAGE_INACTIVE
oFsm.mutexUpgradeParams.Unlock()
- oFsm.mutexIsAwaitingOnuDlResponse.RLock()
- if oFsm.isWaitingForOnuDlResponse {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
- //use non-blocking channel to indicate that the download to ONU was successful
- select {
- case oFsm.chOnuDlReady <- true:
- default:
- }
- } else {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+ //use non-blocking channel (to be independent from receiver state)
+ select {
+ //use non-blocking channel to indicate that the download to ONU was successful
+ case oFsm.chOnuDlReady <- true:
+ default:
}
} else {
oFsm.mutexUpgradeParams.Unlock()
@@ -1401,13 +1399,11 @@
oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN //something like 'DOWNLOADED' would be better - proto def
oFsm.mutexUpgradeParams.Unlock()
//stop the running ONU download timer
- oFsm.mutexIsAwaitingOnuDlResponse.RLock()
- if oFsm.isWaitingForOnuDlResponse {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
- //use channel to indicate that the download to ONU was not successful
- oFsm.chOnuDlReady <- false
- } else {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+ //use non-blocking channel (to be independent from receiver state)
+ select {
+ //use channel to indicate that the download response waiting shall be aborted for this device (channel)
+ case oFsm.chOnuDlReady <- false:
+ default:
}
// TODO!!!: error treatment?
//TODO!!!: possibly send event information for aborted upgrade (aborted by wrong version)?
@@ -1434,13 +1430,11 @@
log.Fields{"device-id": oFsm.deviceID})
_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvWaitForActivate)
}
- oFsm.mutexIsAwaitingOnuDlResponse.RLock()
- if oFsm.isWaitingForOnuDlResponse {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
- //use channel to indicate that the download to ONU was successful
- oFsm.chOnuDlReady <- true
- } else {
- oFsm.mutexIsAwaitingOnuDlResponse.RUnlock()
+ //use non-blocking channel (to be independent from receiver state)
+ select {
+ //use non-blocking channel to indicate that the download to ONU was successful
+ case oFsm.chOnuDlReady <- true:
+ default:
}
return
}
@@ -1569,10 +1563,25 @@
logger.Warnw(ctx, "OnuUpgradeFsm Waiting-adapter-download timeout", log.Fields{
"for device-id": oFsm.deviceID, "image-id": oFsm.imageIdentifier, "timeout": downloadToAdapterTimeout})
oFsm.pFileManager.RemoveReadyRequest(ctx, oFsm.imageIdentifier, aWaitChannel)
+ //running into timeout here may still have the download to adapter active -> abort
+ oFsm.pFileManager.CancelDownload(ctx, oFsm.imageIdentifier)
oFsm.mutexIsAwaitingAdapterDlResponse.Lock()
oFsm.isWaitingForAdapterDlResponse = false
oFsm.mutexIsAwaitingAdapterDlResponse.Unlock()
- oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
+ oFsm.mutexUpgradeParams.Lock()
+ oFsm.conditionalCancelRequested = false //any conditional cancelRequest is superseded by this abortion
+ oFsm.volthaDownloadState = voltha.ImageState_DOWNLOAD_FAILED
+ oFsm.volthaDownloadReason = voltha.ImageState_UNKNOWN_ERROR //something like 'DOWNLOAD_TO_ADAPTER_TIMEOUT' would be better (proto)
+ oFsm.volthaImageState = voltha.ImageState_IMAGE_UNKNOWN //something like 'IMAGE_DOWNLOAD_ABORTED' would be better (proto)
+ oFsm.mutexUpgradeParams.Unlock()
+ //TODO!!!: possibly send event information for aborted upgrade (aborted by omci processing)??
+ if oFsm.pAdaptFsm != nil && oFsm.pAdaptFsm.pFsm != nil {
+ err := oFsm.pAdaptFsm.pFsm.Event(upgradeEvAbort)
+ if err != nil {
+ logger.Warnw(ctx, "onu upgrade fsm could not abort on omci error", log.Fields{
+ "device-id": oFsm.deviceID, "error": err})
+ }
+ }
return
case success := <-aWaitChannel:
@@ -1590,14 +1599,13 @@
}
return
}
- // waiting was aborted (probably on external request)
+ // waiting was aborted (assumed here to be caused by
+ // error detection or cancel at download after upgrade FSM reset/abort with according image states set there)
logger.Debugw(ctx, "OnuUpgradeFsm Waiting-adapter-download aborted", log.Fields{"device-id": oFsm.deviceID})
oFsm.pFileManager.RemoveReadyRequest(ctx, oFsm.imageIdentifier, aWaitChannel)
oFsm.mutexIsAwaitingAdapterDlResponse.Lock()
oFsm.isWaitingForAdapterDlResponse = false
oFsm.mutexIsAwaitingAdapterDlResponse.Unlock()
- //the upgrade process has to be aborted
- oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
return
}
}
@@ -1607,9 +1615,6 @@
downloadToOnuTimeout := time.Duration(1+(oFsm.imageLength/0x400000)) * oFsm.downloadToOnuTimeout4MB
logger.Debugw(ctx, "OnuUpgradeFsm start download-to-ONU timer", log.Fields{"device-id": oFsm.deviceID,
"duration": downloadToOnuTimeout})
- oFsm.mutexIsAwaitingOnuDlResponse.Lock()
- oFsm.isWaitingForOnuDlResponse = true
- oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
select {
// maybe be also some outside cancel (but no context modeled for the moment ...)
// case <-ctx.Done():
@@ -1617,28 +1622,19 @@
case <-time.After(downloadToOnuTimeout): //using an image-size depending timout (in minutes)
logger.Warnw(ctx, "OnuUpgradeFsm Waiting-ONU-download timeout", log.Fields{
"for device-id": oFsm.deviceID, "image-id": oFsm.imageIdentifier, "timeout": downloadToOnuTimeout})
- oFsm.mutexIsAwaitingOnuDlResponse.Lock()
- oFsm.isWaitingForOnuDlResponse = false
- oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
//the upgrade process has to be aborted
- oFsm.abortOnOmciError(ctx, true, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
+ oFsm.abortOnOmciError(ctx, false, voltha.ImageState_IMAGE_UNKNOWN) //no ImageState update
return
case success := <-aWaitChannel:
if success {
logger.Debugw(ctx, "OnuUpgradeFsm image-downloaded on ONU received", log.Fields{"device-id": oFsm.deviceID})
- oFsm.mutexIsAwaitingOnuDlResponse.Lock()
- oFsm.isWaitingForOnuDlResponse = false
- oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
//all fine, let the FSM proceed like defined from the sender of this event
return
}
// waiting was aborted (assumed here to be caused by
- // error detection or cancel at download after upgrade FSM reset/abort)
+ // error detection or cancel at download after upgrade FSM reset/abort with according image states set there)
logger.Debugw(ctx, "OnuUpgradeFsm Waiting-ONU-download aborted", log.Fields{"device-id": oFsm.deviceID})
- oFsm.mutexIsAwaitingOnuDlResponse.Lock()
- oFsm.isWaitingForOnuDlResponse = false
- oFsm.mutexIsAwaitingOnuDlResponse.Unlock()
return
}
}