[VOL-4442] grpc streaming connection monitoring

Change-Id: I8a361473a252f6d2b64578a97980b2b7b3618f55
diff --git a/internal/pkg/core/openonu.go b/internal/pkg/core/openonu.go
index 2c9ebd4..890b92d 100755
--- a/internal/pkg/core/openonu.go
+++ b/internal/pkg/core/openonu.go
@@ -29,6 +29,7 @@
 	vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
 
 	conf "github.com/opencord/voltha-lib-go/v7/pkg/config"
+	"github.com/opencord/voltha-protos/v5/go/adapter_service"
 	"github.com/opencord/voltha-protos/v5/go/common"
 	"github.com/opencord/voltha-protos/v5/go/health"
 	"github.com/opencord/voltha-protos/v5/go/olt_inter_adapter_service"
@@ -163,15 +164,14 @@
 	return nil
 }
 
-/*
-//stop terminates the session
-func (oo *OpenONUAC) stop(ctx context.Context) error {
-	logger.Info(ctx,"stopping-device-manager")
-	oo.exitChannel <- 1
-	logger.Info(ctx,"device-manager-stopped")
+//Stop terminates the session
+func (oo *OpenONUAC) Stop(ctx context.Context) error {
+	logger.Info(ctx, "stopping-device-manager")
+	close(oo.exitChannel)
+	oo.stopAllGrpcClients(ctx)
+	logger.Info(ctx, "device-manager-stopped")
 	return nil
 }
-*/
 
 func (oo *OpenONUAC) addDeviceHandlerToMap(ctx context.Context, agent *deviceHandler) {
 	oo.mutexDeviceHandlersMap.Lock()
@@ -223,14 +223,6 @@
 	return agent
 }
 
-// GetHealthStatus is used as a service readiness validation as a grpc connection
-func (oo *OpenONUAC) GetHealthStatus(ctx context.Context, clientConn *common.Connection) (*health.HealthStatus, error) {
-	// Update the remote reachability
-	oo.updateReachabilityFromRemote(ctx, clientConn)
-
-	return &health.HealthStatus{State: health.HealthStatus_HEALTHY}, nil
-}
-
 // AdoptDevice creates a new device handler if not present already and then adopts the device
 func (oo *OpenONUAC) AdoptDevice(ctx context.Context, device *voltha.Device) (*empty.Empty, error) {
 	if device == nil {
@@ -263,7 +255,7 @@
 	logger.Infow(ctx, "reconcile-device", log.Fields{"device-id": device.Id, "parent-id": device.ParentId})
 
 	// Check whether the grpc client in the adapter of the parent device can reach us yet
-	if !oo.isReachableFromRemote(device.ProxyAddress.AdapterEndpoint, device.ParentId) {
+	if !oo.isReachableFromRemote(ctx, device.ProxyAddress.AdapterEndpoint, device.ProxyAddress.DeviceId) {
 		return nil, status.Errorf(codes.Unavailable, "adapter-not-reachable-from-parent-%s", device.ProxyAddress.AdapterEndpoint)
 	}
 
@@ -988,11 +980,13 @@
 	oo.reachableFromRemote[endpointHash] = &reachabilityFromRemote{lastKeepAlive: time.Now(), keepAliveInterval: remote.KeepAliveInterval}
 }
 
-func (oo *OpenONUAC) isReachableFromRemote(endpoint string, contextInfo string) bool {
+func (oo *OpenONUAC) isReachableFromRemote(ctx context.Context, endpoint string, contextInfo string) bool {
+	logger.Debugw(ctx, "checking-remote-reachability", log.Fields{"endpoint": endpoint, "context": contextInfo})
 	oo.lockReachableFromRemote.RLock()
 	defer oo.lockReachableFromRemote.RUnlock()
 	endpointHash := getHash(endpoint, contextInfo)
 	if _, ok := oo.reachableFromRemote[endpointHash]; ok {
+		logger.Debugw(ctx, "endpoint-exists", log.Fields{"last-keep-alive": time.Since(oo.reachableFromRemote[endpointHash].lastKeepAlive)})
 		// Assume the connection is down if we did not receive 2 keep alives in succession
 		maxKeepAliveWait := time.Duration(oo.reachableFromRemote[endpointHash].keepAliveInterval * 2)
 		return time.Since(oo.reachableFromRemote[endpointHash].lastKeepAlive) <= maxKeepAliveWait
@@ -1000,6 +994,20 @@
 	return false
 }
 
+//stopAllGrpcClients stops all grpc clients in use
+func (oo *OpenONUAC) stopAllGrpcClients(ctx context.Context) {
+	// Stop the clients that connect to the parent
+	oo.lockParentAdapterClients.Lock()
+	for key := range oo.parentAdapterClients {
+		oo.parentAdapterClients[key].Stop(ctx)
+		delete(oo.parentAdapterClients, key)
+	}
+	oo.lockParentAdapterClients.Unlock()
+
+	// Stop core client connection
+	oo.coreClient.Stop(ctx)
+}
+
 func (oo *OpenONUAC) setupParentInterAdapterClient(ctx context.Context, endpoint string) error {
 	logger.Infow(ctx, "setting-parent-adapter-connection", log.Fields{"parent-endpoint": endpoint})
 	oo.lockParentAdapterClients.Lock()
@@ -1011,6 +1019,7 @@
 	childClient, err := vgrpc.NewClient(
 		oo.config.AdapterEndpoint,
 		endpoint,
+		"olt_inter_adapter_service.OltInterAdapterService",
 		oo.oltAdapterRestarted)
 
 	if err != nil {
@@ -1019,7 +1028,7 @@
 
 	oo.parentAdapterClients[endpoint] = childClient
 
-	go oo.parentAdapterClients[endpoint].Start(log.WithSpanFromContext(context.TODO(), ctx), setAndTestOltInterAdapterServiceHandler)
+	go oo.parentAdapterClients[endpoint].Start(log.WithSpanFromContext(context.TODO(), ctx), getOltInterAdapterServiceClientHandler)
 
 	// Wait until we have a connection to the child adapter.
 	// Unlimited retries or until context expires
@@ -1074,13 +1083,12 @@
 	return nil
 }
 
-// setAndTestOltInterAdapterServiceHandler is used to test whether the remote gRPC service is up
-func setAndTestOltInterAdapterServiceHandler(ctx context.Context, conn *grpc.ClientConn, clientConn *common.Connection) interface{} {
-	svc := olt_inter_adapter_service.NewOltInterAdapterServiceClient(conn)
-	if h, err := svc.GetHealthStatus(ctx, clientConn); err != nil || h.State != health.HealthStatus_HEALTHY {
+// getOltInterAdapterServiceClientHandler is used to setup the remote gRPC service
+func getOltInterAdapterServiceClientHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
+	if conn == nil {
 		return nil
 	}
-	return svc
+	return olt_inter_adapter_service.NewOltInterAdapterServiceClient(conn)
 }
 
 func (oo *OpenONUAC) forceDeleteDeviceKvData(ctx context.Context, aDeviceID string) error {
@@ -1126,6 +1134,49 @@
 	return nil
 }
 
+// GetHealthStatus is used by the voltha core to open a streaming connection with the onu adapter.
+func (oo *OpenONUAC) GetHealthStatus(stream adapter_service.AdapterService_GetHealthStatusServer) error {
+	ctx := context.Background()
+	logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+	if stream == nil {
+		return fmt.Errorf("conn-is-nil %v", stream)
+	}
+	initialRequestTime := time.Now()
+	var remoteClient *common.Connection
+	var tempClient *common.Connection
+	var err error
+loop:
+	for {
+		tempClient, err = stream.Recv()
+		if err != nil {
+			logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		// Send a response back
+		err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+		if err != nil {
+			logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		remoteClient = tempClient
+
+		logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+		select {
+		case <-stream.Context().Done():
+			logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+			break loop
+		case <-oo.exitChannel:
+			logger.Warnw(ctx, "received-stop", log.Fields{"remote-client": remoteClient, "initial-conn-time": initialRequestTime})
+			break loop
+		default:
+		}
+	}
+	logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+	return err
+}
+
 /*
  *
  * Unimplemented APIs
diff --git a/internal/pkg/core/openonuInterAdapter.go b/internal/pkg/core/openonuInterAdapter.go
new file mode 100644
index 0000000..8e9bdec
--- /dev/null
+++ b/internal/pkg/core/openonuInterAdapter.go
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2022-present Open Networking Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//Package core provides the utility for onu devices, flows and statistics
+package core
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/golang/protobuf/ptypes/empty"
+	"github.com/opencord/voltha-lib-go/v7/pkg/log"
+	"github.com/opencord/voltha-protos/v5/go/common"
+	"github.com/opencord/voltha-protos/v5/go/health"
+	ia "github.com/opencord/voltha-protos/v5/go/inter_adapter"
+	"github.com/opencord/voltha-protos/v5/go/onu_inter_adapter_service"
+)
+
+//OpenONUACInterAdapter structure holds a reference to ONU adapter
+type OpenONUACInterAdapter struct {
+	onuAdapter  *OpenONUAC
+	exitChannel chan struct{}
+}
+
+//NewOpenONUACAdapter returns a new instance of OpenONUACAdapter
+func NewOpenONUACAdapter(ctx context.Context, onuAdapter *OpenONUAC) *OpenONUACInterAdapter {
+	return &OpenONUACInterAdapter{onuAdapter: onuAdapter, exitChannel: make(chan struct{})}
+}
+
+//Start starts (logs) the adapter
+func (oo *OpenONUACInterAdapter) Start(ctx context.Context) error {
+	logger.Info(ctx, "starting-openonu-inter-adapter")
+	return nil
+}
+
+//OnuIndication redirects the request the the core ONU adapter handler
+func (oo *OpenONUACInterAdapter) OnuIndication(ctx context.Context, onuInd *ia.OnuIndicationMessage) (*empty.Empty, error) {
+	return oo.onuAdapter.OnuIndication(ctx, onuInd)
+}
+
+//OmciIndication redirects the request the the core ONU adapter handler
+func (oo *OpenONUACInterAdapter) OmciIndication(ctx context.Context, msg *ia.OmciMessage) (*empty.Empty, error) {
+	return oo.onuAdapter.OmciIndication(ctx, msg)
+}
+
+//DownloadTechProfile redirects the request the the core ONU adapter handler
+func (oo *OpenONUACInterAdapter) DownloadTechProfile(ctx context.Context, tProfile *ia.TechProfileDownloadMessage) (*empty.Empty, error) {
+	return oo.onuAdapter.DownloadTechProfile(ctx, tProfile)
+}
+
+//DeleteGemPort redirects the request the the core ONU adapter handler
+func (oo *OpenONUACInterAdapter) DeleteGemPort(ctx context.Context, gPort *ia.DeleteGemPortMessage) (*empty.Empty, error) {
+	return oo.onuAdapter.DeleteGemPort(ctx, gPort)
+}
+
+//DeleteTCont redirects the request the the core ONU adapter handler
+func (oo *OpenONUACInterAdapter) DeleteTCont(ctx context.Context, tConf *ia.DeleteTcontMessage) (*empty.Empty, error) {
+	return oo.onuAdapter.DeleteTCont(ctx, tConf)
+}
+
+//Stop terminates the session
+func (oo *OpenONUACInterAdapter) Stop(ctx context.Context) error {
+	close(oo.exitChannel)
+	logger.Info(ctx, "openonu-inter-adapter-stopped")
+	return nil
+}
+
+// GetHealthStatus is used by a OnuInterAdapterService client to detect a connection
+// lost with the gRPC server hosting the OnuInterAdapterService service
+func (oo *OpenONUACInterAdapter) GetHealthStatus(stream onu_inter_adapter_service.OnuInterAdapterService_GetHealthStatusServer) error {
+	ctx := context.Background()
+	logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+	if stream == nil {
+		return fmt.Errorf("conn-is-nil %v", stream)
+	}
+	initialRequestTime := time.Now()
+	var remoteClient *common.Connection
+	var tempClient *common.Connection
+	var err error
+loop:
+	for {
+		tempClient, err = stream.Recv()
+		if err != nil {
+			logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		// Send a response back
+		err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+		if err != nil {
+			logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+
+		remoteClient = tempClient
+		logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+		oo.onuAdapter.updateReachabilityFromRemote(context.Background(), remoteClient)
+
+		select {
+		case <-stream.Context().Done():
+			logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+			break loop
+		case <-oo.exitChannel:
+			logger.Warnw(ctx, "received-stop", log.Fields{"remote-client": remoteClient, "initial-conn-time": initialRequestTime})
+			break loop
+		default:
+		}
+	}
+	logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+	return err
+}