[VOL-4442] Add gRPC streaming connection monitoring

Change-Id: I8a361473a252f6d2b64578a97980b2b7b3618f55
diff --git a/internal/pkg/core/openonu.go b/internal/pkg/core/openonu.go
index 2c9ebd4..890b92d 100755
--- a/internal/pkg/core/openonu.go
+++ b/internal/pkg/core/openonu.go
@@ -29,6 +29,7 @@
 	vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
 
 	conf "github.com/opencord/voltha-lib-go/v7/pkg/config"
+	"github.com/opencord/voltha-protos/v5/go/adapter_service"
 	"github.com/opencord/voltha-protos/v5/go/common"
 	"github.com/opencord/voltha-protos/v5/go/health"
 	"github.com/opencord/voltha-protos/v5/go/olt_inter_adapter_service"
@@ -163,15 +164,14 @@
 	return nil
 }
 
-/*
-//stop terminates the session
-func (oo *OpenONUAC) stop(ctx context.Context) error {
-	logger.Info(ctx,"stopping-device-manager")
-	oo.exitChannel <- 1
-	logger.Info(ctx,"device-manager-stopped")
+// Stop terminates the session: it closes exitChannel, stops all gRPC clients and always returns nil.
+func (oo *OpenONUAC) Stop(ctx context.Context) error {
+	logger.Info(ctx, "stopping-device-manager")
+	close(oo.exitChannel) // a closed channel unblocks every receiver; NOTE(review): a second Stop call would panic on this close — confirm callers invoke it once
+	oo.stopAllGrpcClients(ctx)
+	logger.Info(ctx, "device-manager-stopped")
 	return nil
 }
-*/
 
 func (oo *OpenONUAC) addDeviceHandlerToMap(ctx context.Context, agent *deviceHandler) {
 	oo.mutexDeviceHandlersMap.Lock()
@@ -223,14 +223,6 @@
 	return agent
 }
 
-// GetHealthStatus is used as a service readiness validation as a grpc connection
-func (oo *OpenONUAC) GetHealthStatus(ctx context.Context, clientConn *common.Connection) (*health.HealthStatus, error) {
-	// Update the remote reachability
-	oo.updateReachabilityFromRemote(ctx, clientConn)
-
-	return &health.HealthStatus{State: health.HealthStatus_HEALTHY}, nil
-}
-
 // AdoptDevice creates a new device handler if not present already and then adopts the device
 func (oo *OpenONUAC) AdoptDevice(ctx context.Context, device *voltha.Device) (*empty.Empty, error) {
 	if device == nil {
@@ -263,7 +255,7 @@
 	logger.Infow(ctx, "reconcile-device", log.Fields{"device-id": device.Id, "parent-id": device.ParentId})
 
 	// Check whether the grpc client in the adapter of the parent device can reach us yet
-	if !oo.isReachableFromRemote(device.ProxyAddress.AdapterEndpoint, device.ParentId) {
+	if !oo.isReachableFromRemote(ctx, device.ProxyAddress.AdapterEndpoint, device.ProxyAddress.DeviceId) {
 		return nil, status.Errorf(codes.Unavailable, "adapter-not-reachable-from-parent-%s", device.ProxyAddress.AdapterEndpoint)
 	}
 
@@ -988,11 +980,13 @@
 	oo.reachableFromRemote[endpointHash] = &reachabilityFromRemote{lastKeepAlive: time.Now(), keepAliveInterval: remote.KeepAliveInterval}
 }
 
-func (oo *OpenONUAC) isReachableFromRemote(endpoint string, contextInfo string) bool {
+func (oo *OpenONUAC) isReachableFromRemote(ctx context.Context, endpoint string, contextInfo string) bool {
+	logger.Debugw(ctx, "checking-remote-reachability", log.Fields{"endpoint": endpoint, "context": contextInfo})
 	oo.lockReachableFromRemote.RLock()
 	defer oo.lockReachableFromRemote.RUnlock()
 	endpointHash := getHash(endpoint, contextInfo)
 	if _, ok := oo.reachableFromRemote[endpointHash]; ok {
+		logger.Debugw(ctx, "endpoint-exists", log.Fields{"last-keep-alive": time.Since(oo.reachableFromRemote[endpointHash].lastKeepAlive)})
 		// Assume the connection is down if we did not receive 2 keep alives in succession
 		maxKeepAliveWait := time.Duration(oo.reachableFromRemote[endpointHash].keepAliveInterval * 2)
 		return time.Since(oo.reachableFromRemote[endpointHash].lastKeepAlive) <= maxKeepAliveWait
@@ -1000,6 +994,20 @@
 	return false
 }
 
+// stopAllGrpcClients stops every client in parentAdapterClients, empties that map, then stops coreClient.
+func (oo *OpenONUAC) stopAllGrpcClients(ctx context.Context) {
+	// Stop the clients that connect to the parent; the map lock is held for the whole sweep.
+	oo.lockParentAdapterClients.Lock()
+	for key := range oo.parentAdapterClients {
+		oo.parentAdapterClients[key].Stop(ctx)
+		delete(oo.parentAdapterClients, key) // deleting the current key while ranging over a map is safe in Go
+	}
+	oo.lockParentAdapterClients.Unlock()
+
+	// Stop the core client connection. NOTE(review): assumes coreClient is non-nil at this point — confirm.
+	oo.coreClient.Stop(ctx)
+}
+
 func (oo *OpenONUAC) setupParentInterAdapterClient(ctx context.Context, endpoint string) error {
 	logger.Infow(ctx, "setting-parent-adapter-connection", log.Fields{"parent-endpoint": endpoint})
 	oo.lockParentAdapterClients.Lock()
@@ -1011,6 +1019,7 @@
 	childClient, err := vgrpc.NewClient(
 		oo.config.AdapterEndpoint,
 		endpoint,
+		"olt_inter_adapter_service.OltInterAdapterService",
 		oo.oltAdapterRestarted)
 
 	if err != nil {
@@ -1019,7 +1028,7 @@
 
 	oo.parentAdapterClients[endpoint] = childClient
 
-	go oo.parentAdapterClients[endpoint].Start(log.WithSpanFromContext(context.TODO(), ctx), setAndTestOltInterAdapterServiceHandler)
+	go oo.parentAdapterClients[endpoint].Start(log.WithSpanFromContext(context.TODO(), ctx), getOltInterAdapterServiceClientHandler)
 
 	// Wait until we have a connection to the child adapter.
 	// Unlimited retries or until context expires
@@ -1074,13 +1083,12 @@
 	return nil
 }
 
-// setAndTestOltInterAdapterServiceHandler is used to test whether the remote gRPC service is up
-func setAndTestOltInterAdapterServiceHandler(ctx context.Context, conn *grpc.ClientConn, clientConn *common.Connection) interface{} {
-	svc := olt_inter_adapter_service.NewOltInterAdapterServiceClient(conn)
-	if h, err := svc.GetHealthStatus(ctx, clientConn); err != nil || h.State != health.HealthStatus_HEALTHY {
+// getOltInterAdapterServiceClientHandler returns an OltInterAdapterService client for conn, or nil when conn is nil; ctx is unused.
+func getOltInterAdapterServiceClientHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
+	if conn == nil {
 		return nil
 	}
-	return svc
+	return olt_inter_adapter_service.NewOltInterAdapterServiceClient(conn)
 }
 
 func (oo *OpenONUAC) forceDeleteDeviceKvData(ctx context.Context, aDeviceID string) error {
@@ -1126,6 +1134,49 @@
 	return nil
 }
 
+// GetHealthStatus is used by the voltha core to open a streaming connection with the onu adapter; each message received is answered with HEALTHY until the stream fails, its context is done, or exitChannel is closed.
+func (oo *OpenONUAC) GetHealthStatus(stream adapter_service.AdapterService_GetHealthStatusServer) error {
+	ctx := context.Background() // used for logging only; this handler receives no ctx argument
+	logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+	if stream == nil {
+		return fmt.Errorf("conn-is-nil %v", stream)
+	}
+	initialRequestTime := time.Now()
+	var remoteClient *common.Connection // last message that was both received and answered; nil until the first round trip completes
+	var tempClient *common.Connection
+	var err error
+loop:
+	for {
+		tempClient, err = stream.Recv()
+		if err != nil {
+			logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		// Send a response back; remoteClient is only updated once this send has succeeded.
+		err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+		if err != nil {
+			logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		remoteClient = tempClient
+
+		logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+		select { // non-blocking poll (default case); only reached after a Recv/Send round trip, so within this loop a stop is noticed after the next message
+		case <-stream.Context().Done():
+			logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+			break loop
+		case <-oo.exitChannel:
+			logger.Warnw(ctx, "received-stop", log.Fields{"remote-client": remoteClient, "initial-conn-time": initialRequestTime})
+			break loop
+		default:
+		}
+	}
+	logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+	return err // nil when the loop ended via the select above (context done or stop); the error-level log above is emitted on those paths too
+}
+
 /*
  *
  * Unimplemented APIs