[VOL-4442] grpc streaming connection monitoring
Change-Id: I8a361473a252f6d2b64578a97980b2b7b3618f55
diff --git a/internal/pkg/core/openonu.go b/internal/pkg/core/openonu.go
index 2c9ebd4..890b92d 100755
--- a/internal/pkg/core/openonu.go
+++ b/internal/pkg/core/openonu.go
@@ -29,6 +29,7 @@
vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
conf "github.com/opencord/voltha-lib-go/v7/pkg/config"
+ "github.com/opencord/voltha-protos/v5/go/adapter_service"
"github.com/opencord/voltha-protos/v5/go/common"
"github.com/opencord/voltha-protos/v5/go/health"
"github.com/opencord/voltha-protos/v5/go/olt_inter_adapter_service"
@@ -163,15 +164,14 @@
return nil
}
-/*
-//stop terminates the session
-func (oo *OpenONUAC) stop(ctx context.Context) error {
- logger.Info(ctx,"stopping-device-manager")
- oo.exitChannel <- 1
- logger.Info(ctx,"device-manager-stopped")
+// Stop terminates the session: it closes exitChannel and then stops all gRPC clients. It always returns nil.
+func (oo *OpenONUAC) Stop(ctx context.Context) error {
+ logger.Info(ctx, "stopping-device-manager")
+ close(oo.exitChannel) // NOTE(review): closing an already-closed channel panics, so a second Stop call would panic - confirm Stop runs only once
+ oo.stopAllGrpcClients(ctx)
+ logger.Info(ctx, "device-manager-stopped")
return nil
}
-*/
func (oo *OpenONUAC) addDeviceHandlerToMap(ctx context.Context, agent *deviceHandler) {
oo.mutexDeviceHandlersMap.Lock()
@@ -223,14 +223,6 @@
return agent
}
-// GetHealthStatus is used as a service readiness validation as a grpc connection
-func (oo *OpenONUAC) GetHealthStatus(ctx context.Context, clientConn *common.Connection) (*health.HealthStatus, error) {
- // Update the remote reachability
- oo.updateReachabilityFromRemote(ctx, clientConn)
-
- return &health.HealthStatus{State: health.HealthStatus_HEALTHY}, nil
-}
-
// AdoptDevice creates a new device handler if not present already and then adopts the device
func (oo *OpenONUAC) AdoptDevice(ctx context.Context, device *voltha.Device) (*empty.Empty, error) {
if device == nil {
@@ -263,7 +255,7 @@
logger.Infow(ctx, "reconcile-device", log.Fields{"device-id": device.Id, "parent-id": device.ParentId})
// Check whether the grpc client in the adapter of the parent device can reach us yet
- if !oo.isReachableFromRemote(device.ProxyAddress.AdapterEndpoint, device.ParentId) {
+ if !oo.isReachableFromRemote(ctx, device.ProxyAddress.AdapterEndpoint, device.ProxyAddress.DeviceId) {
return nil, status.Errorf(codes.Unavailable, "adapter-not-reachable-from-parent-%s", device.ProxyAddress.AdapterEndpoint)
}
@@ -988,11 +980,13 @@
oo.reachableFromRemote[endpointHash] = &reachabilityFromRemote{lastKeepAlive: time.Now(), keepAliveInterval: remote.KeepAliveInterval}
}
-func (oo *OpenONUAC) isReachableFromRemote(endpoint string, contextInfo string) bool {
+func (oo *OpenONUAC) isReachableFromRemote(ctx context.Context, endpoint string, contextInfo string) bool {
+ logger.Debugw(ctx, "checking-remote-reachability", log.Fields{"endpoint": endpoint, "context": contextInfo})
oo.lockReachableFromRemote.RLock()
defer oo.lockReachableFromRemote.RUnlock()
endpointHash := getHash(endpoint, contextInfo)
if _, ok := oo.reachableFromRemote[endpointHash]; ok {
+ logger.Debugw(ctx, "endpoint-exists", log.Fields{"last-keep-alive": time.Since(oo.reachableFromRemote[endpointHash].lastKeepAlive)})
// Assume the connection is down if we did not receive 2 keep alives in succession
maxKeepAliveWait := time.Duration(oo.reachableFromRemote[endpointHash].keepAliveInterval * 2)
return time.Since(oo.reachableFromRemote[endpointHash].lastKeepAlive) <= maxKeepAliveWait
@@ -1000,6 +994,20 @@
return false
}
+// stopAllGrpcClients stops every parent-adapter client (removing it from the map) and then the core client.
+func (oo *OpenONUAC) stopAllGrpcClients(ctx context.Context) {
+	oo.lockParentAdapterClients.Lock()
+	for endpoint, parentClient := range oo.parentAdapterClients {
+		// Deleting the entry currently being visited is permitted while ranging over a map.
+		parentClient.Stop(ctx)
+		delete(oo.parentAdapterClients, endpoint)
+	}
+	oo.lockParentAdapterClients.Unlock()
+
+	// The core client is stopped only after the parent-clients lock has been released.
+	oo.coreClient.Stop(ctx)
+}
+
func (oo *OpenONUAC) setupParentInterAdapterClient(ctx context.Context, endpoint string) error {
logger.Infow(ctx, "setting-parent-adapter-connection", log.Fields{"parent-endpoint": endpoint})
oo.lockParentAdapterClients.Lock()
@@ -1011,6 +1019,7 @@
childClient, err := vgrpc.NewClient(
oo.config.AdapterEndpoint,
endpoint,
+ "olt_inter_adapter_service.OltInterAdapterService",
oo.oltAdapterRestarted)
if err != nil {
@@ -1019,7 +1028,7 @@
oo.parentAdapterClients[endpoint] = childClient
- go oo.parentAdapterClients[endpoint].Start(log.WithSpanFromContext(context.TODO(), ctx), setAndTestOltInterAdapterServiceHandler)
+ go oo.parentAdapterClients[endpoint].Start(log.WithSpanFromContext(context.TODO(), ctx), getOltInterAdapterServiceClientHandler)
// Wait until we have a connection to the child adapter.
// Unlimited retries or until context expires
@@ -1074,13 +1083,12 @@
return nil
}
-// setAndTestOltInterAdapterServiceHandler is used to test whether the remote gRPC service is up
-func setAndTestOltInterAdapterServiceHandler(ctx context.Context, conn *grpc.ClientConn, clientConn *common.Connection) interface{} {
- svc := olt_inter_adapter_service.NewOltInterAdapterServiceClient(conn)
- if h, err := svc.GetHealthStatus(ctx, clientConn); err != nil || h.State != health.HealthStatus_HEALTHY {
+// getOltInterAdapterServiceClientHandler returns a new OltInterAdapterService client bound to conn, or nil when conn is nil; ctx is unused.
+func getOltInterAdapterServiceClientHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
+ if conn == nil { // without a connection there is nothing to build a client on
return nil
}
- return svc
+ return olt_inter_adapter_service.NewOltInterAdapterServiceClient(conn)
}
func (oo *OpenONUAC) forceDeleteDeviceKvData(ctx context.Context, aDeviceID string) error {
@@ -1126,6 +1134,49 @@
return nil
}
+// GetHealthStatus serves the keep-alive stream opened by the voltha core towards the onu adapter:
+// every Connection message received is answered with a HEALTHY status. The loop ends on a Recv or
+// Send failure (that error is returned), or, after a message has been answered, when the stream
+// context is done or exitChannel is signalled (nil is returned). A nil stream yields an error.
+func (oo *OpenONUAC) GetHealthStatus(stream adapter_service.AdapterService_GetHealthStatusServer) error {
+	ctx := context.Background()
+	logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+	if stream == nil {
+		return fmt.Errorf("conn-is-nil %v", stream)
+	}
+	initialRequestTime := time.Now()
+	var remoteClient, tempClient *common.Connection
+	var err error
+loop:
+	for {
+		tempClient, err = stream.Recv()
+		if err != nil {
+			logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		// Record the sender before replying so that a failed Send is logged against the right client.
+		remoteClient = tempClient
+		err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+		if err != nil {
+			logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+			break loop
+		}
+		logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+		// Non-blocking check: context and exitChannel are only examined after a keep-alive was answered.
+		select {
+		case <-stream.Context().Done():
+			logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+			break loop
+		case <-oo.exitChannel:
+			logger.Warnw(ctx, "received-stop", log.Fields{"remote-client": remoteClient, "initial-conn-time": initialRequestTime})
+			break loop
+		default:
+		}
+	}
+	logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+	return err
+}
+
/*
*
* Unimplemented APIs