[VOL-4442] grpc streaming connection monitoring

Change-Id: I6b26a29c74be8833e7262eb59d266e6cce66f0c3
diff --git a/rw_core/core/adapter/agent.go b/rw_core/core/adapter/agent.go
index d030e72..e7c3d2b 100644
--- a/rw_core/core/adapter/agent.go
+++ b/rw_core/core/adapter/agent.go
@@ -25,8 +25,6 @@
 	vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
 	"github.com/opencord/voltha-lib-go/v7/pkg/log"
 	"github.com/opencord/voltha-protos/v5/go/adapter_service"
-	"github.com/opencord/voltha-protos/v5/go/common"
-	"github.com/opencord/voltha-protos/v5/go/health"
 	"github.com/opencord/voltha-protos/v5/go/voltha"
 	"google.golang.org/grpc"
 )
@@ -43,13 +41,11 @@
 	coreEndpoint       string
 }
 
-func setAndTestAdapterServiceHandler(ctx context.Context, conn *grpc.ClientConn, clientConn *common.Connection) interface{} {
-	svc := adapter_service.NewAdapterServiceClient(conn)
-	if h, err := svc.GetHealthStatus(ctx, clientConn); err != nil || h.State != health.HealthStatus_HEALTHY {
-		logger.Debugw(ctx, "remote-connection-not-ready", log.Fields{"error": err, "health": h, "requester": clientConn, "target": conn.Target()})
+func getAdapterServiceClientHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
+	if conn == nil {
 		return nil
 	}
-	return svc
+	return adapter_service.NewAdapterServiceClient(conn)
 }
 
 func newAdapterAgent(coreEndpoint string, adapter *voltha.Adapter, onAdapterRestart vgrpc.RestartedHandler, liveProbeInterval time.Duration) *agent {
@@ -68,6 +64,7 @@
 	if aa.vClient, err = vgrpc.NewClient(
 		aa.coreEndpoint,
 		aa.adapterAPIEndPoint,
+		"adapter_service.AdapterService",
 		aa.onAdapterRestart); err != nil {
 		return err
 	}
@@ -75,12 +72,13 @@
 	// Add a liveness communication update
 	aa.vClient.SubscribeForLiveness(aa.updateCommunicationTime)
 
-	go aa.vClient.Start(ctx, setAndTestAdapterServiceHandler)
+	go aa.vClient.Start(ctx, getAdapterServiceClientHandler)
 	return nil
 }
 
 func (aa *agent) stop(ctx context.Context) {
 	// Close the client
+	logger.Infow(ctx, "stopping-adapter-agent", log.Fields{"adapter": aa.adapter})
 	if aa.vClient != nil {
 		aa.vClient.Stop(ctx)
 	}
diff --git a/rw_core/core/adapter/manager.go b/rw_core/core/adapter/manager.go
index 258ff2a..f5a6cac 100644
--- a/rw_core/core/adapter/manager.go
+++ b/rw_core/core/adapter/manager.go
@@ -98,9 +98,18 @@
 	//	Stop all adapters
 	aMgr.lockAdapterAgentsMap.RLock()
 	defer aMgr.lockAdapterAgentsMap.RUnlock()
+	var wg sync.WaitGroup
 	for _, adapterAgent := range aMgr.adapterAgents {
-		adapterAgent.stop(ctx)
+		// Run the agent stop in its own go routine to notify to the
+		// adapters that the Core is no longer a client
+		wg.Add(1)
+		go func(agt *agent) {
+			agt.stop(ctx)
+			wg.Done()
+		}(adapterAgent)
 	}
+	// Wait for all tests to complete
+	wg.Wait()
 }
 
 func (aMgr *Manager) GetAdapterEndpoint(ctx context.Context, deviceID string, deviceType string) (string, error) {
diff --git a/rw_core/core/core.go b/rw_core/core/core.go
index 83282e3..c6ce503 100644
--- a/rw_core/core/core.go
+++ b/rw_core/core/core.go
@@ -42,6 +42,7 @@
 	Shutdown    context.CancelFunc
 	Stopped     chan struct{}
 	KafkaClient kafka.Client
+	adapterMgr  *adapter.Manager
 }
 
 const (
@@ -90,13 +91,6 @@
 func (core *Core) Start(ctx context.Context, id string, cf *config.RWCoreFlags) {
 	logger.Info(ctx, "starting-core-services", log.Fields{"coreId": id})
 
-	// deferred functions are used to run cleanup
-	// failing partway will stop anything that's been started
-	defer close(core.Stopped)
-	defer core.Shutdown()
-
-	logger.Info(ctx, "starting-rw-core-components")
-
 	// setup kv client
 	logger.Debugw(ctx, "create-kv-client", log.Fields{"kvstore": cf.KVStoreType})
 	kvClient, err := newKVClient(ctx, cf.KVStoreType, cf.KVStoreAddress, cf.KVStoreTimeout)
@@ -144,6 +138,10 @@
 	adapterMgr := adapter.NewAdapterManager(cf.GrpcSBIAddress, dbPath, id, backend, cf.LiveProbeInterval)
 	adapterMgr.Start(ctx, adapterService)
 
+	// We do not do a defer adapterMgr.Stop() here as we want this to be ran as soon as
+	// the core is stopped
+	core.adapterMgr = adapterMgr
+
 	// create the core of the system, the device managers
 	deviceMgr, logicalDeviceMgr := device.NewManagers(dbPath, adapterMgr, cf, id, eventProxy)
 
@@ -153,6 +151,7 @@
 	if err != nil {
 		logger.Fatalw(ctx, "failure-starting-device-manager", log.Fields{"error": err})
 	}
+	defer deviceMgr.Stop(ctx, deviceService)
 
 	// Start the logical device manager to load the logical devices.
 	logicalDeviceMgr.Start(ctx, logicalDeviceService)
@@ -183,9 +182,11 @@
 }
 
 // Stop brings down core services
-func (core *Core) Stop() {
+func (core *Core) Stop(ctx context.Context) {
+	// Close all the grpc clients connections to the adapters first
+	core.adapterMgr.Stop(ctx)
 	core.Shutdown()
-	<-core.Stopped
+	close(core.Stopped)
 }
 
 // startGrpcSbiService creates the grpc core service handlers, registers it to the grpc server and starts the server
diff --git a/rw_core/core/device/manager.go b/rw_core/core/device/manager.go
index 7d6e9da..07e92d0 100755
--- a/rw_core/core/device/manager.go
+++ b/rw_core/core/device/manager.go
@@ -58,6 +58,7 @@
 	devicesLoadingLock      sync.RWMutex
 	deviceLoadingInProgress map[string][]chan int
 	config                  *config.RWCoreFlags
+	doneCh                  chan struct{}
 }
 
 //NewManagers creates the Manager and the Logical Manager.
@@ -74,6 +75,7 @@
 		Agent:                   event.NewAgent(eventProxy, coreInstanceID, cf.VolthaStackID),
 		deviceLoadingInProgress: make(map[string][]chan int),
 		config:                  cf,
+		doneCh:                  make(chan struct{}),
 	}
 	deviceMgr.stateTransitions = state.NewTransitionMap(deviceMgr)
 
@@ -128,6 +130,12 @@
 	return nil
 }
 
+func (dMgr *Manager) Stop(ctx context.Context, serviceName string) {
+	logger.Info(ctx, "stopping-device-manager")
+	close(dMgr.doneCh)
+	probe.UpdateStatusFromContext(ctx, serviceName, probe.ServiceStatusStopped)
+}
+
 func (dMgr *Manager) addDeviceAgentToMap(agent *Agent) {
 	if _, exist := dMgr.deviceAgents.Load(agent.deviceID); !exist {
 		dMgr.deviceAgents.Store(agent.deviceID, agent)
diff --git a/rw_core/core/device/manager_sbi.go b/rw_core/core/device/manager_sbi.go
index dcf6599..7a17d6d 100644
--- a/rw_core/core/device/manager_sbi.go
+++ b/rw_core/core/device/manager_sbi.go
@@ -17,23 +17,21 @@
 
 import (
 	"context"
+	"fmt"
+	"time"
 
 	"github.com/golang/protobuf/ptypes/empty"
 	"github.com/opencord/voltha-go/rw_core/utils"
 	"github.com/opencord/voltha-lib-go/v7/pkg/log"
 	"github.com/opencord/voltha-protos/v5/go/common"
 	ca "github.com/opencord/voltha-protos/v5/go/core_adapter"
+	"github.com/opencord/voltha-protos/v5/go/core_service"
 	"github.com/opencord/voltha-protos/v5/go/health"
 	"github.com/opencord/voltha-protos/v5/go/voltha"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
 )
 
-func (dMgr *Manager) GetHealthStatus(ctx context.Context, clientConn *common.Connection) (*health.HealthStatus, error) {
-	logger.Debugw(ctx, "get-health-status-from-remote", log.Fields{"remote-client": clientConn})
-	return &health.HealthStatus{State: health.HealthStatus_HEALTHY}, nil
-}
-
 func (dMgr *Manager) PortCreated(ctx context.Context, port *voltha.Port) (*empty.Empty, error) {
 	ctx = utils.WithNewSpanAndRPCMetadataContext(ctx, "PortCreated")
 
@@ -533,3 +531,45 @@
 	}
 	return &empty.Empty{}, nil
 }
+
+func (dMgr *Manager) GetHealthStatus(stream core_service.CoreService_GetHealthStatusServer) error {
+	ctx := utils.WithRPCMetadataContext(context.Background(), "keep-alive-connection")
+	logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+	if stream == nil {
+		return fmt.Errorf("conn-is-nil %v", stream)
+	}
+	initialRequestTime := time.Now()
+	var remoteClient *common.Connection
+	var tempClient *common.Connection
+	var err error
+loop:
+	for {
+		tempClient, err = stream.Recv()
+		if err != nil {
+			logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		// Send a response back
+		err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+		if err != nil {
+			logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+
+		remoteClient = tempClient
+		logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+		select {
+		case <-stream.Context().Done():
+			logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+			break loop
+		case <-dMgr.doneCh:
+			logger.Warnw(ctx, "received-stop", log.Fields{"remote-client": remoteClient, "initial-conn-time": initialRequestTime})
+			break loop
+		default:
+		}
+	}
+	logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+	return err
+}
diff --git a/rw_core/main.go b/rw_core/main.go
index e8bc6ab..8a9675d 100644
--- a/rw_core/main.go
+++ b/rw_core/main.go
@@ -137,7 +137,7 @@
 	logger.Infow(ctx, "received-a-closing-signal", log.Fields{"code": code})
 
 	// Cleanup before leaving
-	core.Stop()
+	core.Stop(shutdownCtx)
 
 	elapsed := time.Since(start)
 	logger.Infow(ctx, "rw-core-run-time", log.Fields{"core": instanceID, "time": elapsed / time.Second})
diff --git a/rw_core/mocks/adapter.go b/rw_core/mocks/adapter.go
index b99ca52..5a9130e 100644
--- a/rw_core/mocks/adapter.go
+++ b/rw_core/mocks/adapter.go
@@ -169,12 +169,11 @@
 	ta.Probe.UpdateStatus(ctx, serviceName, probe.ServiceStatusStopped)
 }
 
-func setAndTestCoreServiceHandler(ctx context.Context, conn *grpc.ClientConn, clientConn *common.Connection) interface{} {
-	svc := core_service.NewCoreServiceClient(conn)
-	if h, err := svc.GetHealthStatus(ctx, clientConn); err != nil || h.State != health.HealthStatus_HEALTHY {
+func setCoreServiceHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
+	if conn == nil {
 		return nil
 	}
-	return svc
+	return core_service.NewCoreServiceClient(conn)
 }
 
 // gRPC service
diff --git a/rw_core/mocks/adapter_olt.go b/rw_core/mocks/adapter_olt.go
index 96dfb33..0e8c58e 100644
--- a/rw_core/mocks/adapter_olt.go
+++ b/rw_core/mocks/adapter_olt.go
@@ -22,11 +22,14 @@
 	"fmt"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/golang/protobuf/ptypes/empty"
 	"github.com/opencord/voltha-lib-go/v7/pkg/probe"
+	"github.com/opencord/voltha-protos/v5/go/adapter_service"
 	"github.com/opencord/voltha-protos/v5/go/common"
 	"github.com/opencord/voltha-protos/v5/go/extension"
+	"github.com/opencord/voltha-protos/v5/go/health"
 	"github.com/opencord/voltha-protos/v5/go/omci"
 	"github.com/phayes/freeport"
 
@@ -92,13 +95,14 @@
 
 	// Establish grpc connection to Core
 	if oltA.coreClient, err = vgrpc.NewClient(
-		"olt-endpoint",
+		"mock-olt-endpoint",
 		oltA.coreEnpoint,
+		"core_service.CoreService",
 		oltA.oltRestarted); err != nil {
 		logger.Fatal(ctx, "grpc-client-not-created")
 	}
 
-	go oltA.coreClient.Start(probeCtx, setAndTestCoreServiceHandler)
+	go oltA.coreClient.Start(probeCtx, setCoreServiceHandler)
 
 	logger.Debugw(ctx, "OLTAdapter-started", log.Fields{"grpc-address": oltA.serviceEndpoint})
 
@@ -431,3 +435,40 @@
 	}
 
 }
+
+func (oltA *OLTAdapter) GetHealthStatus(stream adapter_service.AdapterService_GetHealthStatusServer) error {
+	ctx := context.Background()
+	logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+	if stream == nil {
+		return fmt.Errorf("conn-is-nil %v", stream)
+	}
+	initialRequestTime := time.Now()
+	var remoteClient *common.Connection
+	var tempClient *common.Connection
+	var err error
+loop:
+	for {
+		tempClient, err = stream.Recv()
+		if err != nil {
+			logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+		if err != nil {
+			logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		remoteClient = tempClient
+		logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+		select {
+		case <-stream.Context().Done():
+			logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+			break loop
+		default:
+		}
+	}
+	logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+	return err
+}
diff --git a/rw_core/mocks/adapter_onu.go b/rw_core/mocks/adapter_onu.go
index d5669c3..dcfc60c 100644
--- a/rw_core/mocks/adapter_onu.go
+++ b/rw_core/mocks/adapter_onu.go
@@ -21,13 +21,16 @@
 	"fmt"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/golang/protobuf/ptypes/empty"
 	vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
 	"github.com/opencord/voltha-lib-go/v7/pkg/probe"
+	"github.com/opencord/voltha-protos/v5/go/adapter_service"
 	"github.com/opencord/voltha-protos/v5/go/common"
 	ca "github.com/opencord/voltha-protos/v5/go/core_adapter"
 	"github.com/opencord/voltha-protos/v5/go/extension"
+	"github.com/opencord/voltha-protos/v5/go/health"
 	"github.com/phayes/freeport"
 
 	"github.com/gogo/protobuf/proto"
@@ -87,12 +90,13 @@
 
 	// Establish grpc connection to Core
 	if onuA.coreClient, err = vgrpc.NewClient(
-		"onu-endpoint",
+		"mock-onu-endpoint",
 		onuA.coreEnpoint,
+		"core_service.CoreService",
 		onuA.onuRestarted); err != nil {
 		logger.Fatal(ctx, "grpc-client-not-created")
 	}
-	go onuA.coreClient.Start(probeCtx, setAndTestCoreServiceHandler)
+	go onuA.coreClient.Start(probeCtx, setCoreServiceHandler)
 
 	logger.Debugw(ctx, "ONUAdapter-started", log.Fields{"grpc-address": onuA.serviceEndpoint})
 }
@@ -297,3 +301,40 @@
 func (onuA *ONUAdapter) StartOmciTest(ctx context.Context, _ *ca.OMCITest) (*omci.TestResponse, error) { // nolint
 	return &omci.TestResponse{Result: omci.TestResponse_SUCCESS}, nil
 }
+
+func (onuA *ONUAdapter) GetHealthStatus(stream adapter_service.AdapterService_GetHealthStatusServer) error {
+	ctx := context.Background()
+	logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+	if stream == nil {
+		return fmt.Errorf("conn-is-nil %v", stream)
+	}
+	initialRequestTime := time.Now()
+	var remoteClient *common.Connection
+	var tempClient *common.Connection
+	var err error
+loop:
+	for {
+		tempClient, err = stream.Recv()
+		if err != nil {
+			logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+		if err != nil {
+			logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		remoteClient = tempClient
+		logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+		select {
+		case <-stream.Context().Done():
+			logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+			break loop
+		default:
+		}
+	}
+	logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+	return err
+}
diff --git a/rw_core/test/core_nbi_handler_multi_test.go b/rw_core/test/core_nbi_handler_multi_test.go
index 2856744..6f5a37e 100755
--- a/rw_core/test/core_nbi_handler_multi_test.go
+++ b/rw_core/test/core_nbi_handler_multi_test.go
@@ -263,7 +263,7 @@
 		}
 	}
 	if nb.core != nil {
-		nb.core.Stop()
+		nb.core.Stop(ctx)
 	}
 }