[VOL-4352] Device reconcile fails if flows are removed while the openonu-adapter is down

Change-Id: I2f39371bd4d6c30a147690d845088969e8a2a003
diff --git a/internal/pkg/core/device_handler.go b/internal/pkg/core/device_handler.go
index d651df5..2ef1926 100755
--- a/internal/pkg/core/device_handler.go
+++ b/internal/pkg/core/device_handler.go
@@ -47,6 +47,7 @@
 	vc "github.com/opencord/voltha-protos/v5/go/common"
 	ca "github.com/opencord/voltha-protos/v5/go/core_adapter"
 	"github.com/opencord/voltha-protos/v5/go/extension"
+	"github.com/opencord/voltha-protos/v5/go/inter_adapter"
 	ia "github.com/opencord/voltha-protos/v5/go/inter_adapter"
 	of "github.com/opencord/voltha-protos/v5/go/openflow_13"
 	"github.com/opencord/voltha-protos/v5/go/openolt"
@@ -204,6 +205,8 @@
 	upgradeCanceled                bool
 	reconciling                    uint8
 	mutexReconcilingFlag           sync.RWMutex
+	reconcilingReasonUpdate        bool
+	mutexReconcilingReasonUpdate   sync.RWMutex
 	chUniVlanConfigReconcilingDone chan uint16 //channel to indicate that VlanConfig reconciling for a specific UNI has been finished
 	chReconcilingFinished          chan bool   //channel to indicate that reconciling has been finished
 	reconcileExpiryComplete        time.Duration
@@ -249,6 +252,7 @@
 	dh.lockUpgradeFsm = sync.RWMutex{}
 	dh.UniVlanConfigFsmMap = make(map[uint8]*avcfg.UniVlanConfigFsm)
 	dh.reconciling = cNoReconciling
+	dh.reconcilingReasonUpdate = false
 	dh.chReconcilingFinished = make(chan bool)
 	dh.reconcileExpiryComplete = adapter.maxTimeoutReconciling //assumption is to have it as duration in s!
 	rECSeconds := int(dh.reconcileExpiryComplete / time.Second)
@@ -850,7 +854,7 @@
 
 	pDevEntry := dh.GetOnuDeviceEntry(ctx, true)
 	if pDevEntry == nil {
-		logger.Errorw(ctx, "No valid OnuDevice - aborting", log.Fields{"device-id": dh.DeviceID})
+		logger.Errorw(ctx, "reconciling - no valid OnuDevice - aborting", log.Fields{"device-id": dh.DeviceID})
 		dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
 		return continueWithFlowConfig
 	}
@@ -871,10 +875,11 @@
 	techProfInstLoadFailed := false
 outerLoop:
 	for _, uniData := range pDevEntry.SOnuPersistentData.PersUniConfig {
+		uniID := uniData.PersUniID
 		//TODO: check for uni-port specific reconcilement in case of multi-uni-port-per-onu-support
 		if !dh.anyTpPathExists(uniData.PersTpPathMap) {
 			logger.Debugw(ctx, "reconciling - no TPs stored for uniID",
-				log.Fields{"uni-id": uniData.PersUniID, "device-id": dh.DeviceID})
+				log.Fields{"uni-id": uniID, "device-id": dh.DeviceID})
 			continue
 		}
 		//release MutexPersOnuConfig before TechProfile (ANIConfig) processing as otherwise the reception of
@@ -883,37 +888,28 @@
 		pDevEntry.MutexPersOnuConfig.RUnlock()
 		persMutexLock = false
 		techProfsFound = true // set to true if we found TP once for any UNI port
+		var iaTechTpInst ia.TechProfileDownloadMessage
+		var ok bool
 		for tpID := range uniData.PersTpPathMap {
-			// Request the TpInstance again from the openolt adapter in case of reconcile
-			iaTechTpInst, err := dh.getTechProfileInstanceFromParentAdapter(ctx,
-				dh.device.ProxyAddress.AdapterEndpoint,
-				&ia.TechProfileInstanceRequestMessage{
-					DeviceId:       dh.device.Id,
-					TpInstancePath: uniData.PersTpPathMap[tpID],
-					ParentDeviceId: dh.parentID,
-					ParentPonPort:  dh.device.ParentPortNo,
-					OnuId:          dh.device.ProxyAddress.OnuId,
-					UniId:          uint32(uniData.PersUniID),
-				})
-			if err != nil || iaTechTpInst == nil {
-				// TODO: During the absence of the ONU adapter there seem to have been TP specific configurations!
-				// The no longer available TPs and the associated flows must be deleted from the ONU KV store
-				// and after a MIB reset a new reconciling attempt with OMCI configuration must be started.
-				logger.Errorw(ctx, "error fetching tp instance",
+			pDevEntry.MutexReconciledTpInstances.RLock()
+			if iaTechTpInst, ok = pDevEntry.ReconciledTpInstances[uniID][tpID]; !ok {
+				logger.Errorw(ctx, "reconciling - no reconciled tp instance available",
 					log.Fields{"tp-id": tpID, "tpPath": uniData.PersTpPathMap[tpID], "uni-id": uniData.PersUniID,
-						"device-id": dh.DeviceID, "err": err})
+						"device-id": dh.DeviceID})
 				techProfInstLoadFailed = true // stop loading tp instance as soon as we hit failure
+				pDevEntry.MutexReconciledTpInstances.RUnlock()
 				break outerLoop
 			}
+			pDevEntry.MutexReconciledTpInstances.RUnlock()
 			continueWithFlowConfig = true // valid TP found - try flow configuration later
 			var tpInst tech_profile.TechProfileInstance
 			switch techTpInst := iaTechTpInst.TechTpInstance.(type) {
 			case *ia.TechProfileDownloadMessage_TpInstance: // supports only GPON, XGPON, XGS-PON
 				tpInst = *techTpInst.TpInstance
-				logger.Debugw(ctx, "received-tp-instance-successfully-after-reconcile", log.Fields{
+				logger.Debugw(ctx, "reconciling - received-tp-instance-successfully-after-reconcile", log.Fields{
 					"tp-id": tpID, "tpPath": uniData.PersTpPathMap[tpID], "uni-id": uniData.PersUniID, "device-id": dh.DeviceID})
 			default: // do not support epon or other tech
-				logger.Errorw(ctx, "unsupported-tech-profile", log.Fields{
+				logger.Errorw(ctx, "reconciling - unsupported-tech-profile", log.Fields{
 					"tp-id": tpID, "tpPath": uniData.PersTpPathMap[tpID], "uni-id": uniData.PersUniID, "device-id": dh.DeviceID})
 				techProfInstLoadFailed = true // stop loading tp instance as soon as we hit failure
 				break outerLoop
@@ -960,11 +956,11 @@
 		return
 	}
 	if abTechProfInstLoadFailed {
-		_ = dh.ReasonUpdate(ctx, cmn.DrTechProfileConfigDownloadFailed, false)
+		_ = dh.ReasonUpdate(ctx, cmn.DrTechProfileConfigDownloadFailed, dh.IsReconcilingReasonUpdate())
 		dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
 		return
 	} else if dh.IsSkipOnuConfigReconciling() {
-		_ = dh.ReasonUpdate(ctx, cmn.DrTechProfileConfigDownloadSuccess, false)
+		_ = dh.ReasonUpdate(ctx, cmn.DrTechProfileConfigDownloadSuccess, dh.IsReconcilingReasonUpdate())
 	}
 	if !abFlowsFound {
 		logger.Debugw(ctx, "reconciling - no flows have been stored before adapter restart - terminate reconcilement",
@@ -978,7 +974,7 @@
 
 	pDevEntry := dh.GetOnuDeviceEntry(ctx, true)
 	if pDevEntry == nil {
-		logger.Errorw(ctx, "No valid OnuDevice - aborting", log.Fields{"device-id": dh.DeviceID})
+		logger.Errorw(ctx, "reconciling - no valid OnuDevice - aborting", log.Fields{"device-id": dh.DeviceID})
 		dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
 		return
 	}
@@ -1056,7 +1052,7 @@
 				pDevEntry.SendChReconcilingFlowsFinished(true)
 			}
 		} else {
-			logger.Errorw(ctx, "timeout waiting for reconciling flows for all UNI's to be finished!",
+			logger.Errorw(ctx, "reconciling - timeout waiting for reconciling flows for all UNI's to be finished!",
 				log.Fields{"device-id": dh.DeviceID})
 			dh.stopReconciling(ctx, false, cWaitReconcileFlowAbortOnError)
 			if pDevEntry != nil {
@@ -1064,7 +1060,7 @@
 			}
 			return
 		}
-		_ = dh.ReasonUpdate(ctx, cmn.DrOmciFlowsPushed, false)
+		_ = dh.ReasonUpdate(ctx, cmn.DrOmciFlowsPushed, dh.IsReconcilingReasonUpdate())
 	}
 }
 
@@ -1989,8 +1985,7 @@
 	if err := pDevEntry.Start(log.WithSpanFromContext(context.TODO(), ctx)); err != nil {
 		return err
 	}
-
-	_ = dh.ReasonUpdate(ctx, cmn.DrStartingOpenomci, !dh.IsReconciling())
+	_ = dh.ReasonUpdate(ctx, cmn.DrStartingOpenomci, !dh.IsReconciling() || dh.IsReconcilingReasonUpdate())
 
 	/* this might be a good time for Omci Verify message?  */
 	verifyExec := make(chan bool)
@@ -2104,9 +2099,9 @@
 					return fmt.Errorf("can't go to state resetting_mib: %s", dh.DeviceID)
 				}
 			} else {
-				if err := pMibUlFsm.Event(mib.UlEvExamineMds); err != nil {
-					logger.Errorw(ctx, "MibSyncFsm: Can't go to state examine_mds", log.Fields{"device-id": dh.DeviceID, "err": err})
-					return fmt.Errorf("can't go to examine_mds: %s", dh.DeviceID)
+				if err := pMibUlFsm.Event(mib.UlEvVerifyAndStoreTPs); err != nil {
+					logger.Errorw(ctx, "MibSyncFsm: Can't go to state verify and store TPs", log.Fields{"device-id": dh.DeviceID, "err": err})
+					return fmt.Errorf("can't go to state verify and store TPs: %s", dh.DeviceID)
 				}
 				logger.Debugw(ctx, "state of MibSyncFsm", log.Fields{"state": string(pMibUlFsm.Current())})
 			}
@@ -2285,7 +2280,7 @@
 		logger.Warnw(ctx, "store persistent data error - continue as there will be additional write attempts",
 			log.Fields{"device-id": dh.DeviceID, "err": err})
 	}
-	_ = dh.ReasonUpdate(ctx, cmn.DrDiscoveryMibsyncComplete, !dh.IsReconciling())
+	_ = dh.ReasonUpdate(ctx, cmn.DrDiscoveryMibsyncComplete, !dh.IsReconciling() || dh.IsReconcilingReasonUpdate())
 	dh.AddAllUniPorts(ctx)
 
 	/* 200605: lock processing after initial MIBUpload removed now as the ONU should be in the lock state per default here */
@@ -2369,7 +2364,7 @@
 		logger.Debugw(ctx, "reconciling - don't notify core about DeviceStateUpdate to ACTIVE",
 			log.Fields{"device-id": dh.DeviceID})
 	}
-	_ = dh.ReasonUpdate(ctx, cmn.DrInitialMibDownloaded, !dh.IsReconciling())
+	_ = dh.ReasonUpdate(ctx, cmn.DrInitialMibDownloaded, !dh.IsReconciling() || dh.IsReconcilingReasonUpdate())
 
 	if !dh.GetCollectorIsRunning() {
 		// Start PM collector routine
@@ -2537,7 +2532,7 @@
 		//  - which may cause some inconsistency
 		if dh.getDeviceReason() != cmn.DrTechProfileConfigDownloadSuccess {
 			// which may be the case from some previous activity even on this UNI Port (but also other UNI ports)
-			_ = dh.ReasonUpdate(ctx, cmn.DrTechProfileConfigDownloadSuccess, !dh.IsReconciling())
+			_ = dh.ReasonUpdate(ctx, cmn.DrTechProfileConfigDownloadSuccess, !dh.IsReconciling() || dh.IsReconcilingReasonUpdate())
 		}
 		if dh.IsReconciling() {
 			go dh.ReconcileDeviceFlowConfig(ctx)
@@ -2563,7 +2558,7 @@
 		if dh.getDeviceReason() != cmn.DrOmciFlowsPushed {
 			// which may be the case from some previous activity on another UNI Port of the ONU
 			// or even some previous flow add activity on the same port
-			_ = dh.ReasonUpdate(ctx, cmn.DrOmciFlowsPushed, !dh.IsReconciling())
+			_ = dh.ReasonUpdate(ctx, cmn.DrOmciFlowsPushed, !dh.IsReconciling() || dh.IsReconcilingReasonUpdate())
 			if dh.IsReconciling() {
 				go dh.reconcileEnd(ctx)
 			}
@@ -4051,6 +4046,15 @@
 			dh.mutexReconcilingFlag.Lock()
 			dh.reconciling = cNoReconciling
 			dh.mutexReconcilingFlag.Unlock()
+			dh.SetReconcilingReasonUpdate(false)
+
+			if onuDevEntry := dh.GetOnuDeviceEntry(ctx, true); onuDevEntry == nil {
+				logger.Errorw(ctx, "No valid OnuDevice", log.Fields{"device-id": dh.DeviceID})
+			} else {
+				onuDevEntry.MutexReconciledTpInstances.Lock()
+				onuDevEntry.ReconciledTpInstances = make(map[uint8]map[uint8]inter_adapter.TechProfileDownloadMessage)
+				onuDevEntry.MutexReconciledTpInstances.Unlock()
+			}
 		}()
 	}
 	dh.mutexReconcilingFlag.Lock()
@@ -4086,6 +4090,18 @@
 	return dh.reconciling == cSkipOnuConfigReconciling
 }
 
+func (dh *deviceHandler) SetReconcilingReasonUpdate(value bool) { // stores value in dh.reconcilingReasonUpdate
+	dh.mutexReconcilingReasonUpdate.Lock() // write lock on the RWMutex that guards the flag
+	defer dh.mutexReconcilingReasonUpdate.Unlock()
+	dh.reconcilingReasonUpdate = value
+}
+
+func (dh *deviceHandler) IsReconcilingReasonUpdate() bool { // returns the current value of dh.reconcilingReasonUpdate
+	dh.mutexReconcilingReasonUpdate.RLock() // read lock on the RWMutex that guards the flag
+	defer dh.mutexReconcilingReasonUpdate.RUnlock()
+	return dh.reconcilingReasonUpdate // read while the read lock is held; the deferred RUnlock runs afterwards
+}
+
 func (dh *deviceHandler) getDeviceReason() uint8 {
 	dh.mutexDeviceReason.RLock()
 	value := dh.deviceReason
@@ -4217,16 +4233,26 @@
 Helper functions to communicate with parent adapter
 */
 
-func (dh *deviceHandler) getTechProfileInstanceFromParentAdapter(ctx context.Context, parentEndpoint string,
-	request *ia.TechProfileInstanceRequestMessage) (*ia.TechProfileDownloadMessage, error) {
-	pgClient, err := dh.pOpenOnuAc.getParentAdapterServiceClient(parentEndpoint)
+func (dh *deviceHandler) GetTechProfileInstanceFromParentAdapter(ctx context.Context, aUniID uint8,
+	aTpPath string) (*ia.TechProfileDownloadMessage, error) {
+
+	var request = ia.TechProfileInstanceRequestMessage{
+		DeviceId:       dh.DeviceID,
+		TpInstancePath: aTpPath,
+		ParentDeviceId: dh.parentID,
+		ParentPonPort:  dh.device.ParentPortNo,
+		OnuId:          dh.device.ProxyAddress.OnuId,
+		UniId:          uint32(aUniID),
+	}
+
+	pgClient, err := dh.pOpenOnuAc.getParentAdapterServiceClient(dh.device.ProxyAddress.AdapterEndpoint)
 	if err != nil || pgClient == nil {
 		return nil, err
 	}
 	subCtx, cancel := context.WithTimeout(log.WithSpanFromContext(context.Background(), ctx), dh.config.MaxTimeoutInterAdapterComm)
 	defer cancel()
-	logger.Debugw(subCtx, "get-tech-profile-instance", log.Fields{"request": request, "parent-endpoint": parentEndpoint})
-	return pgClient.GetTechProfileInstance(subCtx, request)
+	logger.Debugw(subCtx, "get-tech-profile-instance", log.Fields{"request": request, "parent-endpoint": dh.device.ProxyAddress.AdapterEndpoint})
+	return pgClient.GetTechProfileInstance(subCtx, &request)
 }
 
 // This routine is unique per ONU ID and blocks on flowControlBlock channel for incoming flows