VOL-3052 Onu Software upgrade correction and remove internal test functionality

Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: Icf0e72648fe56a86f7cc15d57750a6566c431daa
diff --git a/internal/pkg/onuadaptercore/omci_onu_upgrade.go b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
index fd7df97..0158977 100644
--- a/internal/pkg/onuadaptercore/omci_onu_upgrade.go
+++ b/internal/pkg/onuadaptercore/omci_onu_upgrade.go
@@ -40,6 +40,8 @@
 	//cOmciDownloadWindowRetryMax  = 2    // max attempts for a specific window
 	cOmciSectionInterleaveMilliseconds = 100 //DownloadSection interleave time in milliseconds
 	cOmciEndSwDlDelaySeconds           = 1   //End Software Download delay after last section (may be also configurable?)
+	cWaitCountEndSwDl                  = 6   //maximum number of EndSwDl requests
+	cWaitDelayEndSwDlSeconds           = 10  //duration, how long is waited before next request on EndSwDl
 	//cOmciDownloadCompleteTimeout = 5400 //in s for the complete timeout (may be better scale to image size/ noOfWindows)
 )
 
@@ -51,6 +53,8 @@
 	upgradeEvWaitWindowAck      = "upgradeEvWaitWindowAck"
 	upgradeEvContinueNextWindow = "upgradeEvContinueNextWindow"
 	upgradeEvEndSwDownload      = "upgradeEvEndSwDownload"
+	upgradeEvWaitEndDownload    = "upgradeEvWaitEndDownload"
+	upgradeEvContinueFinalize   = "upgradeEvContinueFinalize"
 	upgradeEvRequestActivate    = "upgradeEvRequestActivate"
 	upgradeEvWaitForCommit      = "upgradeEvWaitForCommit"
 	upgradeEvCommitSw           = "upgradeEvCommitSw"
@@ -71,6 +75,7 @@
 	upgradeStDLSection          = "upgradeStDLSection"
 	upgradeStVerifyWindow       = "upgradeStVerifyWindow"
 	upgradeStFinalizeDL         = "upgradeStFinalizeDL"
+	upgradeStWaitEndDL          = "upgradeStWaitEndDL"
 	upgradeStRequestingActivate = "upgradeStRequestingActivate"
 	upgradeStWaitForCommit      = "upgradeStWaitForCommit"
 	upgradeStCommitSw           = "upgradeStCommitSw"
@@ -91,23 +96,26 @@
 	pOnuDB           *onuDeviceDB
 	requestEvent     OnuDeviceEvent
 	//omciMIdsResponseReceived chan bool //seperate channel needed for checking multiInstance OMCI message responses
-	pAdaptFsm                         *AdapterFsm
-	pImageDsc                         *voltha.ImageDownload
-	imageBuffer                       []byte
-	origImageLength                   uint32        //as also limited by OMCI
-	imageLength                       uint32        //including last bytes padding
-	omciDownloadWindowSizeLimit       uint8         //windowSize-1 in sections
-	omciDownloadWindowSizeLast        uint8         //number of sections in last window
-	noOfSections                      uint32        //uint32 range for sections should be sufficient for very long images
-	nextDownloadSectionsAbsolute      uint32        //number of next section to download in overall image
-	nextDownloadSectionsWindow        uint8         //number of next section to download within current window
-	noOfWindows                       uint32        //uint32 range for windows should be sufficient for very long images
-	nextDownloadWindow                uint32        //number of next window to download
-	inactiveImageMeID                 uint16        //ME-ID of the inactive image
-	omciSectionInterleaveMilliseconds time.Duration //DownloadSectionInterleave delay in milliseconds
-	delayEndSwDl                      bool          //flag to provide a delay between last section and EndSwDl
-	pLastTxMeInstance                 *me.ManagedEntity
-	useSoftReboot                     bool
+	pAdaptFsm                    *AdapterFsm
+	pImageDsc                    *voltha.ImageDownload
+	imageBuffer                  []byte
+	origImageLength              uint32        //as also limited by OMCI
+	imageCRC                     uint32        //as per OMCI - ITU I.363.5 crc
+	imageLength                  uint32        //including last bytes padding
+	omciDownloadWindowSizeLimit  uint8         //windowSize-1 in sections
+	omciDownloadWindowSizeLast   uint8         //number of sections in last window
+	noOfSections                 uint32        //uint32 range for sections should be sufficient for very long images
+	nextDownloadSectionsAbsolute uint32        //number of next section to download in overall image
+	nextDownloadSectionsWindow   uint8         //number of next section to download within current window
+	noOfWindows                  uint32        //uint32 range for windows should be sufficient for very long images
+	nextDownloadWindow           uint32        //number of next window to download
+	inactiveImageMeID            uint16        //ME-ID of the inactive image
+	omciSectionInterleaveDelay   time.Duration //DownloadSectionInterleave delay in milliseconds
+	delayEndSwDl                 bool          //flag to provide a delay between last section and EndSwDl
+	pLastTxMeInstance            *me.ManagedEntity
+	waitCountEndSwDl             uint8         //number, how often is waited for EndSwDl at maximum
+	waitDelayEndSwDl             time.Duration //duration, how long is waited before next request on EndSwDl
+	chReceiveExpectedResponse    chan bool
 }
 
 //NewOnuUpgradeFsm is the 'constructor' for the state machine to config the PON ANI ports
@@ -116,15 +124,18 @@
 	apDevEntry *OnuDeviceEntry, apOnuDB *onuDeviceDB,
 	aRequestEvent OnuDeviceEvent, aName string, aCommChannel chan Message) *OnuUpgradeFsm {
 	instFsm := &OnuUpgradeFsm{
-		pDeviceHandler:                    apDeviceHandler,
-		deviceID:                          apDeviceHandler.deviceID,
-		pOnuOmciDevice:                    apDevEntry,
-		pOmciCC:                           apDevEntry.PDevOmciCC,
-		pOnuDB:                            apOnuDB,
-		requestEvent:                      aRequestEvent,
-		omciDownloadWindowSizeLimit:       cOmciDownloadWindowSizeLimit,
-		omciSectionInterleaveMilliseconds: cOmciSectionInterleaveMilliseconds,
+		pDeviceHandler:              apDeviceHandler,
+		deviceID:                    apDeviceHandler.deviceID,
+		pOnuOmciDevice:              apDevEntry,
+		pOmciCC:                     apDevEntry.PDevOmciCC,
+		pOnuDB:                      apOnuDB,
+		requestEvent:                aRequestEvent,
+		omciDownloadWindowSizeLimit: cOmciDownloadWindowSizeLimit,
+		omciSectionInterleaveDelay:  cOmciSectionInterleaveMilliseconds,
+		waitCountEndSwDl:            cWaitCountEndSwDl,
+		waitDelayEndSwDl:            cWaitDelayEndSwDlSeconds,
 	}
+	instFsm.chReceiveExpectedResponse = make(chan bool)
 
 	instFsm.pAdaptFsm = NewAdapterFsm(aName, instFsm.deviceID, aCommChannel)
 	if instFsm.pAdaptFsm == nil {
@@ -141,7 +152,9 @@
 			{Name: upgradeEvWaitWindowAck, Src: []string{upgradeStDLSection}, Dst: upgradeStVerifyWindow},
 			{Name: upgradeEvContinueNextWindow, Src: []string{upgradeStVerifyWindow}, Dst: upgradeStDLSection},
 			{Name: upgradeEvEndSwDownload, Src: []string{upgradeStVerifyWindow}, Dst: upgradeStFinalizeDL},
-			{Name: upgradeEvRequestActivate, Src: []string{upgradeStFinalizeDL}, Dst: upgradeStRequestingActivate},
+			{Name: upgradeEvWaitEndDownload, Src: []string{upgradeStFinalizeDL}, Dst: upgradeStWaitEndDL},
+			{Name: upgradeEvContinueFinalize, Src: []string{upgradeStWaitEndDL}, Dst: upgradeStFinalizeDL},
+			{Name: upgradeEvRequestActivate, Src: []string{upgradeStWaitEndDL}, Dst: upgradeStRequestingActivate},
 			{Name: upgradeEvWaitForCommit, Src: []string{upgradeStRequestingActivate}, Dst: upgradeStWaitForCommit},
 			{Name: upgradeEvCommitSw, Src: []string{upgradeStStarting, upgradeStWaitForCommit},
 				Dst: upgradeStCommitSw},
@@ -155,11 +168,11 @@
 			*/
 			// exceptional treatments
 			{Name: upgradeEvReset, Src: []string{upgradeStStarting, upgradeStPreparingDL, upgradeStDLSection,
-				upgradeStVerifyWindow, upgradeStDLSection, upgradeStFinalizeDL, upgradeStRequestingActivate,
+				upgradeStVerifyWindow, upgradeStDLSection, upgradeStFinalizeDL, upgradeStWaitEndDL, upgradeStRequestingActivate,
 				upgradeStCommitSw, upgradeStCheckCommitted}, //upgradeStWaitForCommit is not reset (later perhaps also not upgradeStWaitActivate)
 				Dst: upgradeStResetting},
 			{Name: upgradeEvAbort, Src: []string{upgradeStStarting, upgradeStPreparingDL, upgradeStDLSection,
-				upgradeStVerifyWindow, upgradeStDLSection, upgradeStFinalizeDL, upgradeStRequestingActivate,
+				upgradeStVerifyWindow, upgradeStDLSection, upgradeStFinalizeDL, upgradeStWaitEndDL, upgradeStRequestingActivate,
 				upgradeStWaitForCommit, upgradeStCommitSw, upgradeStCheckCommitted},
 				Dst: upgradeStResetting},
 			{Name: upgradeEvRestart, Src: []string{upgradeStResetting}, Dst: upgradeStDisabled},
@@ -171,6 +184,7 @@
 			"enter_" + upgradeStDLSection:          func(e *fsm.Event) { instFsm.enterDownloadSection(ctx, e) },
 			"enter_" + upgradeStVerifyWindow:       func(e *fsm.Event) { instFsm.enterVerifyWindow(ctx, e) },
 			"enter_" + upgradeStFinalizeDL:         func(e *fsm.Event) { instFsm.enterFinalizeDL(ctx, e) },
+			"enter_" + upgradeStWaitEndDL:          func(e *fsm.Event) { instFsm.enterWaitEndDL(ctx, e) },
 			"enter_" + upgradeStRequestingActivate: func(e *fsm.Event) { instFsm.enterActivateSw(ctx, e) },
 			"enter_" + upgradeStCommitSw:           func(e *fsm.Event) { instFsm.enterCommitSw(ctx, e) },
 			"enter_" + upgradeStCheckCommitted:     func(e *fsm.Event) { instFsm.enterCheckCommitted(ctx, e) },
@@ -197,14 +211,6 @@
 			"device-id": oFsm.deviceID, "image-description": apImageDsc})
 		oFsm.inactiveImageMeID = aInactiveImageID //upgrade state machines run on configured inactive ImageId
 		oFsm.pImageDsc = apImageDsc
-		//path overwrite for internal test file usage
-		oFsm.useSoftReboot = false
-		if apImageDsc.LocalDir == "/intern" {
-			oFsm.pImageDsc.LocalDir = "/tmp"
-		} else if apImageDsc.LocalDir == "/reboot" {
-			oFsm.useSoftReboot = true
-			oFsm.pImageDsc.LocalDir = "/tmp"
-		}
 		oFsm.pDownloadManager = apDownloadManager
 
 		go func(aPBaseFsm *fsm.FSM) {
@@ -354,9 +360,9 @@
 		}
 		framePrint = false                //for the next Section frame (if wanted, can be enabled in logic before sendXXX())
 		oFsm.nextDownloadSectionsWindow++ //increase the window related section counter only if not in the last section
-		if oFsm.omciSectionInterleaveMilliseconds > 0 {
+		if oFsm.omciSectionInterleaveDelay > 0 {
 			//ensure a defined intersection-time-gap to leave space for further processing, other ONU's ...
-			time.Sleep(oFsm.omciSectionInterleaveMilliseconds * time.Millisecond)
+			time.Sleep(oFsm.omciSectionInterleaveDelay * time.Millisecond)
 		}
 	}
 }
@@ -367,9 +373,8 @@
 }
 
 func (oFsm *OnuUpgradeFsm) enterFinalizeDL(ctx context.Context, e *fsm.Event) {
-	imageCRC := crc32a.Checksum(oFsm.imageBuffer[:int(oFsm.origImageLength)]) //ITU I.363.5 crc
 	logger.Infow(ctx, "OnuUpgradeFsm finalize DL", log.Fields{
-		"device-id": oFsm.deviceID, "crc": strconv.FormatInt(int64(imageCRC), 16), "delay": oFsm.delayEndSwDl})
+		"device-id": oFsm.deviceID, "crc": strconv.FormatInt(int64(oFsm.imageCRC), 16), "delay": oFsm.delayEndSwDl})
 
 	if oFsm.delayEndSwDl {
 		//give the ONU some time for image evaluation (hoping it does not base that on first EndSwDl itself)
@@ -377,19 +382,92 @@
 		time.Sleep(cOmciEndSwDlDelaySeconds * time.Second)
 	}
 
+	pBaseFsm := oFsm.pAdaptFsm
+	if pBaseFsm == nil {
+		logger.Errorw(ctx, "EndSwDl abort: BaseFsm invalid", log.Fields{
+			"device-id": oFsm.deviceID})
+		//TODO!!!: define some more sophisticated error treatment with some repetition, for now just reset the FSM
+		// Can't call FSM Event directly, decoupling it
+		go func(a_pAFsm *AdapterFsm) {
+			_ = a_pAFsm.pFsm.Event(upgradeEvAbort)
+		}(pBaseFsm)
+		return
+	}
 	err := oFsm.pOmciCC.sendEndSoftwareDownload(log.WithSpanFromContext(context.TODO(), ctx), oFsm.pDeviceHandler.pOpenOnuAc.omciTimeout, false,
-		oFsm.pAdaptFsm.commChan, oFsm.inactiveImageMeID, oFsm.origImageLength, imageCRC)
+		oFsm.pAdaptFsm.commChan, oFsm.inactiveImageMeID, oFsm.origImageLength, oFsm.imageCRC)
 	if err != nil {
 		logger.Errorw(ctx, "EndSwDl abort: can't send section", log.Fields{
 			"device-id": oFsm.deviceID, "error": err})
 		//TODO!!!: define some more sophisticated error treatment with some repetition, for now just reset the FSM
-		pBaseFsm := oFsm.pAdaptFsm
 		// Can't call FSM Event directly, decoupling it
 		go func(a_pAFsm *AdapterFsm) {
 			_ = a_pAFsm.pFsm.Event(upgradeEvAbort)
 		}(pBaseFsm)
 		return
 	}
+	// go waiting for the EndSwDLResponse and check, if the ONU is ready for activation
+	// Can't call FSM Event directly, decoupling it
+	go func(a_pAFsm *AdapterFsm) {
+		_ = a_pAFsm.pFsm.Event(upgradeEvWaitEndDownload)
+	}(pBaseFsm)
+}
+
+func (oFsm *OnuUpgradeFsm) enterWaitEndDL(ctx context.Context, e *fsm.Event) {
+	logger.Infow(ctx, "OnuUpgradeFsm WaitEndDl", log.Fields{
+		"device-id": oFsm.deviceID, "wait delay": oFsm.waitDelayEndSwDl * time.Second, "wait count": oFsm.waitCountEndSwDl})
+	if oFsm.waitCountEndSwDl == 0 {
+		logger.Errorw(ctx, "WaitEndDl abort: max limit of EndSwDL reached", log.Fields{
+			"device-id": oFsm.deviceID})
+		pBaseFsm := oFsm.pAdaptFsm
+		if pBaseFsm == nil {
+			logger.Errorw(ctx, "WaitEndDl abort: BaseFsm invalid", log.Fields{
+				"device-id": oFsm.deviceID})
+			return
+		}
+		go func(a_pAFsm *AdapterFsm) {
+			_ = a_pAFsm.pFsm.Event(upgradeEvAbort)
+		}(pBaseFsm)
+		return
+	}
+
+	oFsm.waitCountEndSwDl--
+	select {
+	case <-time.After(oFsm.waitDelayEndSwDl * time.Second):
+		pBaseFsm := oFsm.pAdaptFsm
+		if pBaseFsm == nil {
+			logger.Errorw(ctx, "WaitEndDl abort: BaseFsm invalid", log.Fields{
+				"device-id": oFsm.deviceID})
+			//FSM may be reset already from somewhere else, nothing we can do here anymore
+			return
+		}
+		//retry End SW DL
+		oFsm.delayEndSwDl = false //no more extra delay for the request
+		go func(a_pAFsm *AdapterFsm) {
+			_ = a_pAFsm.pFsm.Event(upgradeEvContinueFinalize)
+		}(pBaseFsm)
+		return
+	case success := <-oFsm.chReceiveExpectedResponse:
+		logger.Debugw(ctx, "WaitEndDl stop  wait timer", log.Fields{"device-id": oFsm.deviceID})
+		pBaseFsm := oFsm.pAdaptFsm
+		if pBaseFsm == nil {
+			logger.Errorw(ctx, "WaitEndDl abort: BaseFsm invalid", log.Fields{
+				"device-id": oFsm.deviceID})
+			//FSM may be reset already from somewhere else, nothing we can do here anymore
+			return
+		}
+		if success {
+			//answer received with ready indication
+			go func(a_pAFsm *AdapterFsm) {
+				_ = a_pAFsm.pFsm.Event(upgradeEvRequestActivate)
+			}(pBaseFsm)
+			return
+		}
+		//timer was aborted
+		go func(a_pAFsm *AdapterFsm) {
+			_ = a_pAFsm.pFsm.Event(upgradeEvAbort)
+		}(pBaseFsm)
+		return
+	}
 }
 
 func (oFsm *OnuUpgradeFsm) enterActivateSw(ctx context.Context, e *fsm.Event) {
@@ -491,6 +569,12 @@
 
 func (oFsm *OnuUpgradeFsm) enterDisabled(ctx context.Context, e *fsm.Event) {
 	logger.Debugw(ctx, "OnuUpgradeFsm enters disabled state", log.Fields{"device-id": oFsm.deviceID})
+	//flush possible left-over channels
+	select {
+	case <-oFsm.chReceiveExpectedResponse:
+		logger.Debug(ctx, "OnuUpgradeFsm chReceiveExpectedResponse flushed", log.Fields{"for device-id": oFsm.deviceID})
+	default:
+	}
 	if oFsm.pDeviceHandler != nil {
 		//request removal of 'reference' in the Handler (completely clear the FSM and its data)
 		go oFsm.pDeviceHandler.removeOnuUpgradeFsm(ctx)
@@ -637,7 +721,8 @@
 						_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvAbort)
 						return
 					}
-					oFsm.delayEndSwDl = true //ensure a delay for the EndSwDl message
+					oFsm.delayEndSwDl = true                                                      //ensure a delay for the EndSwDl message
+					oFsm.imageCRC = crc32a.Checksum(oFsm.imageBuffer[:int(oFsm.origImageLength)]) //store internal for multiple usage
 					_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvEndSwDownload)
 					return
 				}
@@ -681,7 +766,12 @@
 			logger.Debugw(ctx, "OnuUpgradeFsm EndSwDlResponse data", log.Fields{
 				"device-id": oFsm.deviceID, "data-fields": msgObj})
 			if msgObj.Result != me.Success {
-				//TODO!!: Busy must be handled to give the ONU time for internal image storage, perhaps also processing error (CRC incorrect)
+				if msgObj.Result == me.DeviceBusy {
+					//ONU indicates it is still processing the image - let the FSM just wait and then repeat the request
+					logger.Debugw(ctx, "OnuUpgradeFsm EndSwDlResponse busy: waiting before sending new request", log.Fields{
+						"device-id": oFsm.deviceID})
+					return
+				}
 				logger.Errorw(ctx, "OnuUpgradeFsm EndSwDlResponse result error - later: drive FSM to abort state ?",
 					log.Fields{"device-id": oFsm.deviceID, "Error": msgObj.Result})
 				// possibly force FSM into abort or ignore some errors for some messages? store error for mgmt display?
@@ -691,7 +781,7 @@
 			}
 			if msgObj.EntityInstance == oFsm.inactiveImageMeID {
 				logger.Debugw(ctx, "Expected EndSwDlResponse received", log.Fields{"device-id": oFsm.deviceID})
-				_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvRequestActivate)
+				oFsm.chReceiveExpectedResponse <- true //let the FSM proceed from the waitState
 				return
 			}
 			logger.Errorw(ctx, "OnuUpgradeFsm StartSwDlResponse wrong ME instance: try again (later)?",
@@ -732,10 +822,6 @@
 			if msgObj.EntityInstance == oFsm.inactiveImageMeID {
 				logger.Infow(ctx, "Expected ActivateSwResponse received", log.Fields{"device-id": oFsm.deviceID})
 				_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvWaitForCommit)
-				if oFsm.useSoftReboot {
-					//TODO:  as long as BBSIM does not fully support upgrade: simulate restart by calling the BBSIM (ONU) reboot
-					go oFsm.pDeviceHandler.rebootDevice(ctx, false, oFsm.pDeviceHandler.device)
-				}
 				return
 			}
 			logger.Errorw(ctx, "OnuUpgradeFsm ActivateSwResponse wrong ME instance: abort",
@@ -829,9 +915,8 @@
 			//a check on the delivered image version is not done, the ONU delivered version might be different from what might have been
 			//  indicated in the download image version string (version must be part of the image content itself)
 			//  so checking that might be quite unreliable
-			// TODO!! workaround for still not valid bbsim load indications (re-use SoftReboot flag for simplicity)
-			if oFsm.useSoftReboot || (msgObj.EntityInstance == oFsm.inactiveImageMeID && imageIsActive == swIsActive &&
-				imageIsCommitted == swIsCommitted) {
+			if msgObj.EntityInstance == oFsm.inactiveImageMeID && imageIsActive == swIsActive &&
+				imageIsCommitted == swIsCommitted {
 				logger.Infow(ctx, "requested SW image committed, releasing OnuUpgrade", log.Fields{"device-id": oFsm.deviceID})
 				//releasing the upgrade FSM
 				_ = oFsm.pAdaptFsm.pFsm.Event(upgradeEvReset)