[VOL-5446] implementation of retry logic for processing errors during software upgrade

Change-Id: Ida41fc14e27979d4f17ddbe0cca0c1adacaa34fc
Signed-off-by: mgouda <madhumati.gouda@radisys.com>
diff --git a/VERSION b/VERSION
index 4f63ee5..9d44043 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.12.18
+2.12.19
diff --git a/internal/pkg/swupg/omci_onu_upgrade.go b/internal/pkg/swupg/omci_onu_upgrade.go
index 5c54762..3ef4612 100755
--- a/internal/pkg/swupg/omci_onu_upgrade.go
+++ b/internal/pkg/swupg/omci_onu_upgrade.go
@@ -50,6 +50,7 @@
 	cWaitCountEndSwDl                  = 6  //maximum number of EndSwDl requests
 	cWaitDelayEndSwDlSeconds           = 10 //duration, how long is waited before next request on EndSwDl
 	//cOmciDownloadCompleteTimeout = 5400 //in s for the complete timeout (may be better scale to image size/ noOfWindows)
+	cMaxRetryAttemptsForDownloadWindow = 3 //maximun number of retry attempts to download the window
 )
 
 // tEndSwDlResponseResult - Response result from EndSwDownload as used in channel indication
@@ -84,6 +85,10 @@
 	cEndSwDlResponseAbort
 )
 
+const (
+	ResultErr = "result error"
+)
+
 // upgrade FSM related events
 const (
 	UpgradeEvStart              = "UpgradeEvStart"
@@ -146,52 +151,54 @@
 	pOmciCC          *cmn.OmciCC
 	pOnuDB           *devdb.OnuDeviceDB
 	//omciMIdsResponseReceived chan bool //seperate channel needed for checking multiInstance OMCI message responses
-	PAdaptFsm                        *cmn.AdapterFsm
-	pImageDsc                        *voltha.ImageDownload
-	pLastTxMeInstance                *me.ManagedEntity
-	chReceiveExpectedResponse        chan bool
-	chAdapterDlReady                 chan bool
-	chAbortDelayEndSwDl              chan struct{}
-	chOnuDlReady                     chan bool
-	chReceiveAbortEndSwDlResponse    chan tEndSwDlResponseResult
-	deviceID                         string
-	imageVersion                     string //name of the image as used within OMCI (and on extrenal API interface)
-	imageIdentifier                  string //name of the image as used in the adapter
-	imageBuffer                      []byte
-	requestEvent                     cmn.OnuDeviceEvent
-	downloadToOnuTimeout4MB          time.Duration //timeout for downloading the image to the ONU for a 4MB image slice
-	omciSectionInterleaveDelay       time.Duration //DownloadSectionInterleave delay in milliseconds
-	waitDelayEndSwDl                 time.Duration //duration, how long is waited before next request on EndSwDl
-	omciDownloadSectionSize          int64
-	mutexUpgradeParams               sync.RWMutex //mutex to protect members for parallel function requests and omci response processing
-	mutexIsAwaitingAdapterDlResponse sync.RWMutex
-	mutexAbortRequest                sync.RWMutex
-	origImageLength                  uint32 //as also limited by OMCI
-	imageCRC                         uint32 //as per OMCI - ITU I.363.5 crc
-	imageLength                      uint32 //including last bytes padding
-	noOfSections                     uint32 //uint32 range for sections should be sufficient for very long images
-	nextDownloadSectionsAbsolute     uint32 //number of next section to download in overall image
-	noOfWindows                      uint32 //uint32 range for windows should be sufficient for very long images
-	nextDownloadWindow               uint32 //number of next window to download
-	abortRequested                   voltha.ImageState_ImageFailureReason
-	volthaDownloadState              voltha.ImageState_ImageDownloadState
-	volthaDownloadReason             voltha.ImageState_ImageFailureReason
-	volthaImageState                 voltha.ImageState_ImageActivationState
-	InactiveImageMeID                uint16 //ME-ID of the inactive image
-	omciDownloadWindowSizeLimit      uint8  //windowSize-1 in sections
-	omciDownloadWindowSizeLast       uint8  //number of sections in last window
-	nextDownloadSectionsWindow       uint8  //number of next section to download within current window
-	delayEndSwDl                     bool   //flag to provide a delay between last section and EndSwDl
-	repeatAbort                      bool   //flag to indicate if OMCI EndSwDownload (abort) is to be repeated
-	waitCountEndSwDl                 uint8  //number, how often is waited for EndSwDl at maximum
-	useAPIVersion43                  bool   //flag for indication on which API version is used (and accordingly which specific methods)
-	isWaitingForAdapterDlResponse    bool
-	activateImage                    bool
-	commitImage                      bool
-	conditionalCancelRequested       bool
-	upgradePhase                     tUpgradePhase
-	isEndSwDlOpen                    bool
-	isExtendedOmci                   bool
+	PAdaptFsm                         *cmn.AdapterFsm
+	pImageDsc                         *voltha.ImageDownload
+	pLastTxMeInstance                 *me.ManagedEntity
+	chReceiveExpectedResponse         chan bool
+	chAdapterDlReady                  chan bool
+	chAbortDelayEndSwDl               chan struct{}
+	chOnuDlReady                      chan bool
+	chReceiveAbortEndSwDlResponse     chan tEndSwDlResponseResult
+	deviceID                          string
+	imageVersion                      string //name of the image as used within OMCI (and on extrenal API interface)
+	imageIdentifier                   string //name of the image as used in the adapter
+	currentErrState                   string
+	imageBuffer                       []byte
+	requestEvent                      cmn.OnuDeviceEvent
+	downloadToOnuTimeout4MB           time.Duration //timeout for downloading the image to the ONU for a 4MB image slice
+	omciSectionInterleaveDelay        time.Duration //DownloadSectionInterleave delay in milliseconds
+	waitDelayEndSwDl                  time.Duration //duration, how long is waited before next request on EndSwDl
+	omciDownloadSectionSize           int64
+	mutexUpgradeParams                sync.RWMutex //mutex to protect members for parallel function requests and omci response processing
+	mutexIsAwaitingAdapterDlResponse  sync.RWMutex
+	mutexAbortRequest                 sync.RWMutex
+	origImageLength                   uint32 //as also limited by OMCI
+	imageCRC                          uint32 //as per OMCI - ITU I.363.5 crc
+	imageLength                       uint32 //including last bytes padding
+	noOfSections                      uint32 //uint32 range for sections should be sufficient for very long images
+	nextDownloadSectionsAbsolute      uint32 //number of next section to download in overall image
+	noOfWindows                       uint32 //uint32 range for windows should be sufficient for very long images
+	nextDownloadWindow                uint32 //number of next window to download
+	abortRequested                    voltha.ImageState_ImageFailureReason
+	volthaDownloadState               voltha.ImageState_ImageDownloadState
+	volthaDownloadReason              voltha.ImageState_ImageFailureReason
+	volthaImageState                  voltha.ImageState_ImageActivationState
+	InactiveImageMeID                 uint16 //ME-ID of the inactive image
+	omciDownloadWindowSizeLimit       uint8  //windowSize-1 in sections
+	omciDownloadWindowSizeLast        uint8  //number of sections in last window
+	nextDownloadSectionsWindow        uint8  //number of next section to download within current window
+	maxRetryAttemptsForDownloadWindow uint8
+	delayEndSwDl                      bool  //flag to provide a delay between last section and EndSwDl
+	repeatAbort                       bool  //flag to indicate if OMCI EndSwDownload (abort) is to be repeated
+	waitCountEndSwDl                  uint8 //number, how often is waited for EndSwDl at maximum
+	useAPIVersion43                   bool  //flag for indication on which API version is used (and accordingly which specific methods)
+	isWaitingForAdapterDlResponse     bool
+	activateImage                     bool
+	commitImage                       bool
+	conditionalCancelRequested        bool
+	upgradePhase                      tUpgradePhase
+	isEndSwDlOpen                     bool
+	isExtendedOmci                    bool
 }
 
 // NewOnuUpgradeFsm is the 'constructor' for the state machine to config the PON ANI ports
@@ -201,21 +208,22 @@
 	apDevEntry cmn.IonuDeviceEntry, apOnuDB *devdb.OnuDeviceDB,
 	aRequestEvent cmn.OnuDeviceEvent, aName string, aCommChannel chan cmn.Message) *OnuUpgradeFsm {
 	instFsm := &OnuUpgradeFsm{
-		pDeviceHandler:             apDeviceHandler,
-		deviceID:                   apDeviceHandler.GetDeviceID(),
-		pDevEntry:                  apDevEntry,
-		pOmciCC:                    apDevEntry.GetDevOmciCC(),
-		pOnuDB:                     apOnuDB,
-		requestEvent:               aRequestEvent,
-		omciSectionInterleaveDelay: cOmciSectionInterleaveMilliseconds,
-		downloadToOnuTimeout4MB:    apDeviceHandler.GetDlToOnuTimeout4M(),
-		waitCountEndSwDl:           cWaitCountEndSwDl,
-		waitDelayEndSwDl:           cWaitDelayEndSwDlSeconds,
-		upgradePhase:               cUpgradeUndefined,
-		volthaDownloadState:        voltha.ImageState_DOWNLOAD_UNKNOWN,
-		volthaDownloadReason:       voltha.ImageState_NO_ERROR,
-		volthaImageState:           voltha.ImageState_IMAGE_UNKNOWN,
-		abortRequested:             voltha.ImageState_NO_ERROR,
+		pDeviceHandler:                    apDeviceHandler,
+		deviceID:                          apDeviceHandler.GetDeviceID(),
+		pDevEntry:                         apDevEntry,
+		pOmciCC:                           apDevEntry.GetDevOmciCC(),
+		pOnuDB:                            apOnuDB,
+		requestEvent:                      aRequestEvent,
+		omciSectionInterleaveDelay:        cOmciSectionInterleaveMilliseconds,
+		downloadToOnuTimeout4MB:           apDeviceHandler.GetDlToOnuTimeout4M(),
+		waitCountEndSwDl:                  cWaitCountEndSwDl,
+		waitDelayEndSwDl:                  cWaitDelayEndSwDlSeconds,
+		upgradePhase:                      cUpgradeUndefined,
+		volthaDownloadState:               voltha.ImageState_DOWNLOAD_UNKNOWN,
+		volthaDownloadReason:              voltha.ImageState_NO_ERROR,
+		volthaImageState:                  voltha.ImageState_IMAGE_UNKNOWN,
+		abortRequested:                    voltha.ImageState_NO_ERROR,
+		maxRetryAttemptsForDownloadWindow: cMaxRetryAttemptsForDownloadWindow,
 	}
 	instFsm.chReceiveExpectedResponse = make(chan bool)
 	instFsm.chAdapterDlReady = make(chan bool)
@@ -1500,8 +1508,14 @@
 	logger.Debugw(ctx, "OnuUpgradeFsm DlSectionResponse Data", log.Fields{
 		"device-id": oFsm.deviceID, "data-fields": msgObj})
 	if msgObj.Result != me.Success {
-		logger.Errorw(ctx, "OnuUpgradeFsm DlSectionResponse result error - later: repeat window once?", //TODO!!!
+		logger.Warnf(ctx, "OnuUpgradeFsm DlSectionResponse result error - later: repeat window once?",
 			log.Fields{"device-id": oFsm.deviceID, "Error": msgObj.Result})
+		if msgObj.Result == me.ProcessingError {
+			oFsm.currentErrState = ResultErr
+			oFsm.mutexUpgradeParams.Lock()
+			oFsm.retrySoftwareDownload(ctx)
+			return
+		}
 		oFsm.abortOnOmciError(ctx, false)
 		return
 	}
@@ -2017,3 +2031,22 @@
 	oFsm.pDevEntry = nil
 	oFsm.pOmciCC = nil
 }
+
+func (oFsm *OnuUpgradeFsm) retrySoftwareDownload(ctx context.Context) {
+	logger.Infow(ctx, "Number of retry attempts remaining for the download window ", log.Fields{"error": oFsm.currentErrState,
+		"device-id": oFsm.deviceID, "maxRetryAttemptsForDownloadWindow": oFsm.maxRetryAttemptsForDownloadWindow})
+	if oFsm.maxRetryAttemptsForDownloadWindow > 0 {
+		oFsm.maxRetryAttemptsForDownloadWindow--
+		oFsm.nextDownloadSectionsWindow = 0 // resets the section for current window
+
+		// reset absolute section counter to the start of the current window
+		oFsm.nextDownloadSectionsAbsolute = oFsm.nextDownloadWindow * uint32(oFsm.omciDownloadWindowSizeLimit+1)
+		oFsm.mutexUpgradeParams.Unlock()
+		_ = oFsm.PAdaptFsm.PFsm.Event(UpgradeEvContinueNextWindow)
+	} else {
+		oFsm.mutexUpgradeParams.Unlock()
+		logger.Errorw(ctx, "OnuUpgradeFsm DlSectionResponse, max limit reached", log.Fields{"error": oFsm.currentErrState,
+			"device-id": oFsm.deviceID})
+		oFsm.abortOnOmciError(ctx, false)
+	}
+}