[VOL-5381] - handle logical agent creation failure

Change-Id: I9dd685117d13456bbcd6db8fd7723fa66967b949
Signed-off-by: Sridhar Ravindra <sridhar.ravindra@radisys.com>
diff --git a/VERSION b/VERSION
index b727628..9608f8e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.6.2
+3.6.3-dev1
diff --git a/rw_core/core/device/logical_agent.go b/rw_core/core/device/logical_agent.go
index b5c5cc0..253a523 100644
--- a/rw_core/core/device/logical_agent.go
+++ b/rw_core/core/device/logical_agent.go
@@ -57,6 +57,7 @@
 	orderedEvents   orderedEvents
 	startOnce       sync.Once
 	stopOnce        sync.Once
+	exitChannel     chan int
 
 	flowCache   *flow.Cache
 	meterLoader *meter.Loader
@@ -77,6 +78,7 @@
 		flowDecomposer:  fd.NewFlowDecomposer(deviceMgr.getDeviceReadOnly),
 		internalTimeout: internalTimeout,
 		requestQueue:    coreutils.NewRequestQueue(),
+		exitChannel:     make(chan int, 1),
 
 		flowCache:   flow.NewCache(),
 		groupCache:  group.NewCache(),
@@ -86,10 +88,11 @@
 }
 
 // start creates the logical device and add it to the data model
-func (agent *LogicalAgent) start(ctx context.Context, logicalDeviceExist bool, logicalDevice *voltha.LogicalDevice) error {
+func (agent *LogicalAgent) start(ctx context.Context, logicalDeviceExist bool, logicalDevice *voltha.LogicalDevice) {
 	needToStart := false
 	if agent.startOnce.Do(func() { needToStart = true }); !needToStart {
-		return nil
+		logger.Debug(ctx, "starting-logical-device-agent already running")
+		return
 	}
 
 	logger.Infow(ctx, "starting-logical-device-agent", log.Fields{"logical-device-id": agent.logicalDeviceID, "load-from-db": logicalDeviceExist})
@@ -97,8 +100,8 @@
 	var startSucceeded bool
 	defer func() {
 		if !startSucceeded {
-			if err := agent.stop(ctx); err != nil {
-				logger.Errorw(ctx, "failed-to-cleanup-after-unsuccessful-start", log.Fields{"logical-device-id": agent.logicalDeviceID, "error": err})
+			if stopErr := agent.stop(ctx); stopErr != nil {
+				logger.Errorw(ctx, "failed-to-cleanup-after-unsuccessful-start", log.Fields{"logical-device-id": agent.logicalDeviceID, "error": stopErr})
 			}
 		}
 	}()
@@ -108,15 +111,40 @@
 		//Build the logical device based on information retrieved from the device adapter
 		var switchCap *ca.SwitchCapability
 		var err error
+
 		if switchCap, err = agent.deviceMgr.getSwitchCapability(ctx, agent.rootDeviceID); err != nil {
-			return err
+			logger.Warnw(ctx, "failed-to-get-switch-capability", log.Fields{"root-device-id": agent.rootDeviceID, "error": err})
+			switchCapTicker := time.NewTicker(time.Second * 2)
+			defer switchCapTicker.Stop()
+
+			// Start a retry loop to get the switch capability of the OLT device from the adapter
+			for {
+				select {
+				case <-switchCapTicker.C:
+					if switchCap, err = agent.deviceMgr.getSwitchCapability(ctx, agent.rootDeviceID); err == nil {
+						logger.Infow(ctx, "received switch capability, proceeding to start logical device agent", log.Fields{"root-device-id": agent.rootDeviceID})
+					}
+					// The next case ends the retries once the agent has stopped and exitChannel is closed
+				case _, ok := (<-agent.exitChannel):
+					if !ok {
+						logger.Warnw(ctx, "agent stopped, exit retrying get-switch-capability", log.Fields{"root-device-id": agent.rootDeviceID})
+						return
+					}
+				}
+				// Leave the retry loop once the switch capability has been received from the adapter
+				if err == nil {
+					break
+				}
+				logger.Warnw(ctx, "retrying get-switch-capability", log.Fields{"root-device-id": agent.rootDeviceID, "error": err})
+			}
 		}
 		ld = &voltha.LogicalDevice{Id: agent.logicalDeviceID, RootDeviceId: agent.rootDeviceID}
 
 		// Create the datapath ID (uint64) using the logical device ID (based on the MAC Address)
 		var datapathID uint64
 		if datapathID, err = coreutils.CreateDataPathID(agent.serialNumber); err != nil {
-			return err
+			logger.Errorw(ctx, "failed-to-create-datapath-id", log.Fields{"serial-number": agent.serialNumber, "error": err})
+			return
 		}
 		ld.DatapathId = datapathID
 		ld.Desc = (proto.Clone(switchCap.Desc)).(*ofp.OfpDesc)
@@ -126,7 +154,7 @@
 		// Save the logical device
 		if err := agent.ldProxy.Set(ctx, ld.Id, ld); err != nil {
 			logger.Errorw(ctx, "failed-to-add-logical-device", log.Fields{"logical-device-id": agent.logicalDeviceID})
-			return err
+			return
 		}
 		logger.Debugw(ctx, "logical-device-created", log.Fields{"logical-device-id": agent.logicalDeviceID, "root-id": ld.RootDeviceId})
 
@@ -147,9 +175,12 @@
 			ld = &voltha.LogicalDevice{}
 			have, err := agent.ldProxy.Get(ctx, agent.logicalDeviceID, ld)
 			if err != nil {
-				return err
+				logger.Errorw(ctx, "failed-to-load-logical-device-from-db", log.Fields{"logical-device-id": agent.logicalDeviceID, "error": err})
+				return
 			} else if !have {
-				return status.Errorf(codes.NotFound, "logical_device-%s", agent.logicalDeviceID)
+				err := status.Errorf(codes.NotFound, "logical_device-%s", agent.logicalDeviceID)
+				logger.Errorw(ctx, "logical-device-not-found-in-db", log.Fields{"logical-device-id": agent.logicalDeviceID, "error": err})
+				return
 			}
 		}
 
@@ -178,8 +209,7 @@
 
 	}
 	startSucceeded = true
-
-	return nil
+	agent.ldeviceMgr.addLogicalDeviceAgentToMap(agent)
 }
 
 // stop stops the logical device agent.  This removes the logical device from the data model.
@@ -217,6 +247,7 @@
 		// TODO: remove all entries from all loaders
 		// TODO: don't allow any more modifications to flows/groups/meters/ports or to any logical device field
 
+		close(agent.exitChannel)
 		agent.stopped = true
 
 		logger.Info(ctx, "logical-device-agent-stopped")
diff --git a/rw_core/core/device/logical_manager.go b/rw_core/core/device/logical_manager.go
index ed882a8..3812e2f 100644
--- a/rw_core/core/device/logical_manager.go
+++ b/rw_core/core/device/logical_manager.go
@@ -61,11 +61,7 @@
 	for _, lDevice := range logicalDevices {
 		// Create an agent for each device
 		agent := newLogicalAgent(ctx, lDevice.Id, "", "", ldMgr, ldMgr.deviceMgr, ldMgr.dbPath, ldMgr.ldProxy, ldMgr.internalTimeout)
-		if err := agent.start(ctx, true, lDevice); err != nil {
-			logger.Warnw(ctx, "failure-starting-logical-agent", log.Fields{"logical-device-id": lDevice.Id})
-		} else {
-			ldMgr.logicalDeviceAgents.Store(agent.logicalDeviceID, agent)
-		}
+		go agent.start(ctx, true, lDevice)
 	}
 
 	probe.UpdateStatusFromContext(ctx, serviceName, probe.ServiceStatusRunning)
@@ -161,7 +157,6 @@
 	logger.Debugw(ctx, "logical-device-id", log.Fields{"logical-device-id": id})
 
 	agent := newLogicalAgent(ctx, id, sn, device.Id, ldMgr, ldMgr.deviceMgr, ldMgr.dbPath, ldMgr.ldProxy, ldMgr.internalTimeout)
-	ldMgr.addLogicalDeviceAgentToMap(agent)
 
 	// Update the root device with the logical device Id reference
 	if err := ldMgr.deviceMgr.setParentID(ctx, device, id); err != nil {
@@ -169,11 +164,7 @@
 		return nil, err
 	}
 
-	err := agent.start(ctx, false, nil)
-	if err != nil {
-		logger.Errorw(ctx, "unable-to-create-the-logical-device", log.Fields{"error": err})
-		ldMgr.deleteLogicalDeviceAgent(id)
-	}
+	go agent.start(ctx, false, nil)
 
 	logger.Debug(ctx, "creating-logical-device-ends")
 	return &id, nil
@@ -229,10 +220,7 @@
 			if _, err := ldMgr.getLogicalDeviceFromModel(ctx, lDeviceID); err == nil {
 				logger.Debugw(ctx, "loading-logical-device", log.Fields{"lDeviceId": lDeviceID})
 				agent := newLogicalAgent(ctx, lDeviceID, "", "", ldMgr, ldMgr.deviceMgr, ldMgr.dbPath, ldMgr.ldProxy, ldMgr.internalTimeout)
-				if err := agent.start(ctx, true, nil); err != nil {
-					return err
-				}
-				ldMgr.logicalDeviceAgents.Store(agent.logicalDeviceID, agent)
+				go agent.start(ctx, true, nil)
 			} else {
 				logger.Debugw(ctx, "logical-device-not-in-model", log.Fields{"logical-device-id": lDeviceID})
 			}