[VOL-4442] grpc streaming connection monitoring
Change-Id: I6b26a29c74be8833e7262eb59d266e6cce66f0c3
diff --git a/rw_core/core/adapter/agent.go b/rw_core/core/adapter/agent.go
index d030e72..e7c3d2b 100644
--- a/rw_core/core/adapter/agent.go
+++ b/rw_core/core/adapter/agent.go
@@ -25,8 +25,6 @@
vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
"github.com/opencord/voltha-lib-go/v7/pkg/log"
"github.com/opencord/voltha-protos/v5/go/adapter_service"
- "github.com/opencord/voltha-protos/v5/go/common"
- "github.com/opencord/voltha-protos/v5/go/health"
"github.com/opencord/voltha-protos/v5/go/voltha"
"google.golang.org/grpc"
)
@@ -43,13 +41,11 @@
coreEndpoint string
}
-func setAndTestAdapterServiceHandler(ctx context.Context, conn *grpc.ClientConn, clientConn *common.Connection) interface{} {
- svc := adapter_service.NewAdapterServiceClient(conn)
- if h, err := svc.GetHealthStatus(ctx, clientConn); err != nil || h.State != health.HealthStatus_HEALTHY {
- logger.Debugw(ctx, "remote-connection-not-ready", log.Fields{"error": err, "health": h, "requester": clientConn, "target": conn.Target()})
+func getAdapterServiceClientHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
+ if conn == nil {
return nil
}
- return svc
+ return adapter_service.NewAdapterServiceClient(conn)
}
func newAdapterAgent(coreEndpoint string, adapter *voltha.Adapter, onAdapterRestart vgrpc.RestartedHandler, liveProbeInterval time.Duration) *agent {
@@ -68,6 +64,7 @@
if aa.vClient, err = vgrpc.NewClient(
aa.coreEndpoint,
aa.adapterAPIEndPoint,
+ "adapter_service.AdapterService",
aa.onAdapterRestart); err != nil {
return err
}
@@ -75,12 +72,13 @@
// Add a liveness communication update
aa.vClient.SubscribeForLiveness(aa.updateCommunicationTime)
- go aa.vClient.Start(ctx, setAndTestAdapterServiceHandler)
+ go aa.vClient.Start(ctx, getAdapterServiceClientHandler)
return nil
}
func (aa *agent) stop(ctx context.Context) {
// Close the client
+ logger.Infow(ctx, "stopping-adapter-agent", log.Fields{"adapter": aa.adapter})
if aa.vClient != nil {
aa.vClient.Stop(ctx)
}
diff --git a/rw_core/core/adapter/manager.go b/rw_core/core/adapter/manager.go
index 258ff2a..f5a6cac 100644
--- a/rw_core/core/adapter/manager.go
+++ b/rw_core/core/adapter/manager.go
@@ -98,9 +98,18 @@
// Stop all adapters
aMgr.lockAdapterAgentsMap.RLock()
defer aMgr.lockAdapterAgentsMap.RUnlock()
+ var wg sync.WaitGroup
for _, adapterAgent := range aMgr.adapterAgents {
- adapterAgent.stop(ctx)
+ // Run each agent stop in its own goroutine so that all adapters are
+ // notified concurrently that the Core is no longer a client
+ wg.Add(1)
+ go func(agt *agent) {
+ agt.stop(ctx)
+ wg.Done()
+ }(adapterAgent)
}
+ // Wait for all adapter agents to stop
+ wg.Wait()
}
func (aMgr *Manager) GetAdapterEndpoint(ctx context.Context, deviceID string, deviceType string) (string, error) {
diff --git a/rw_core/core/core.go b/rw_core/core/core.go
index 83282e3..c6ce503 100644
--- a/rw_core/core/core.go
+++ b/rw_core/core/core.go
@@ -42,6 +42,7 @@
Shutdown context.CancelFunc
Stopped chan struct{}
KafkaClient kafka.Client
+ adapterMgr *adapter.Manager
}
const (
@@ -90,13 +91,6 @@
func (core *Core) Start(ctx context.Context, id string, cf *config.RWCoreFlags) {
logger.Info(ctx, "starting-core-services", log.Fields{"coreId": id})
- // deferred functions are used to run cleanup
- // failing partway will stop anything that's been started
- defer close(core.Stopped)
- defer core.Shutdown()
-
- logger.Info(ctx, "starting-rw-core-components")
-
// setup kv client
logger.Debugw(ctx, "create-kv-client", log.Fields{"kvstore": cf.KVStoreType})
kvClient, err := newKVClient(ctx, cf.KVStoreType, cf.KVStoreAddress, cf.KVStoreTimeout)
@@ -144,6 +138,10 @@
adapterMgr := adapter.NewAdapterManager(cf.GrpcSBIAddress, dbPath, id, backend, cf.LiveProbeInterval)
adapterMgr.Start(ctx, adapterService)
+ // We do not defer adapterMgr.Stop() here because it must run as soon as
+ // the core is stopped; the manager is kept on the core so Stop can call it
+ core.adapterMgr = adapterMgr
+
// create the core of the system, the device managers
deviceMgr, logicalDeviceMgr := device.NewManagers(dbPath, adapterMgr, cf, id, eventProxy)
@@ -153,6 +151,7 @@
if err != nil {
logger.Fatalw(ctx, "failure-starting-device-manager", log.Fields{"error": err})
}
+ defer deviceMgr.Stop(ctx, deviceService)
// Start the logical device manager to load the logical devices.
logicalDeviceMgr.Start(ctx, logicalDeviceService)
@@ -183,9 +182,11 @@
}
// Stop brings down core services
-func (core *Core) Stop() {
+func (core *Core) Stop(ctx context.Context) {
+ // Close all the gRPC client connections to the adapters first
+ core.adapterMgr.Stop(ctx)
core.Shutdown()
- <-core.Stopped
+ close(core.Stopped)
}
// startGrpcSbiService creates the grpc core service handlers, registers it to the grpc server and starts the server
diff --git a/rw_core/core/device/manager.go b/rw_core/core/device/manager.go
index 7d6e9da..07e92d0 100755
--- a/rw_core/core/device/manager.go
+++ b/rw_core/core/device/manager.go
@@ -58,6 +58,7 @@
devicesLoadingLock sync.RWMutex
deviceLoadingInProgress map[string][]chan int
config *config.RWCoreFlags
+ doneCh chan struct{}
}
//NewManagers creates the Manager and the Logical Manager.
@@ -74,6 +75,7 @@
Agent: event.NewAgent(eventProxy, coreInstanceID, cf.VolthaStackID),
deviceLoadingInProgress: make(map[string][]chan int),
config: cf,
+ doneCh: make(chan struct{}),
}
deviceMgr.stateTransitions = state.NewTransitionMap(deviceMgr)
@@ -128,6 +130,12 @@
return nil
}
+func (dMgr *Manager) Stop(ctx context.Context, serviceName string) {
+ logger.Info(ctx, "stopping-device-manager")
+ close(dMgr.doneCh)
+ probe.UpdateStatusFromContext(ctx, serviceName, probe.ServiceStatusStopped)
+}
+
func (dMgr *Manager) addDeviceAgentToMap(agent *Agent) {
if _, exist := dMgr.deviceAgents.Load(agent.deviceID); !exist {
dMgr.deviceAgents.Store(agent.deviceID, agent)
diff --git a/rw_core/core/device/manager_sbi.go b/rw_core/core/device/manager_sbi.go
index dcf6599..7a17d6d 100644
--- a/rw_core/core/device/manager_sbi.go
+++ b/rw_core/core/device/manager_sbi.go
@@ -17,23 +17,21 @@
import (
"context"
+ "fmt"
+ "time"
"github.com/golang/protobuf/ptypes/empty"
"github.com/opencord/voltha-go/rw_core/utils"
"github.com/opencord/voltha-lib-go/v7/pkg/log"
"github.com/opencord/voltha-protos/v5/go/common"
ca "github.com/opencord/voltha-protos/v5/go/core_adapter"
+ "github.com/opencord/voltha-protos/v5/go/core_service"
"github.com/opencord/voltha-protos/v5/go/health"
"github.com/opencord/voltha-protos/v5/go/voltha"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
-func (dMgr *Manager) GetHealthStatus(ctx context.Context, clientConn *common.Connection) (*health.HealthStatus, error) {
- logger.Debugw(ctx, "get-health-status-from-remote", log.Fields{"remote-client": clientConn})
- return &health.HealthStatus{State: health.HealthStatus_HEALTHY}, nil
-}
-
func (dMgr *Manager) PortCreated(ctx context.Context, port *voltha.Port) (*empty.Empty, error) {
ctx = utils.WithNewSpanAndRPCMetadataContext(ctx, "PortCreated")
@@ -533,3 +531,45 @@
}
return &empty.Empty{}, nil
}
+
+func (dMgr *Manager) GetHealthStatus(stream core_service.CoreService_GetHealthStatusServer) error {
+ ctx := utils.WithRPCMetadataContext(context.Background(), "keep-alive-connection")
+ logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+ if stream == nil {
+ return fmt.Errorf("conn-is-nil %v", stream)
+ }
+ initialRequestTime := time.Now()
+ var remoteClient *common.Connection
+ var tempClient *common.Connection
+ var err error
+loop:
+ for {
+ tempClient, err = stream.Recv()
+ if err != nil {
+ logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+ break loop
+ }
+ // Send a response back
+ err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+ if err != nil {
+ logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+ break loop
+ }
+
+ remoteClient = tempClient
+ logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+ select {
+ case <-stream.Context().Done():
+ logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+ break loop
+ case <-dMgr.doneCh:
+ logger.Warnw(ctx, "received-stop", log.Fields{"remote-client": remoteClient, "initial-conn-time": initialRequestTime})
+ break loop
+ default:
+ }
+ }
+ logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+ return err
+}
diff --git a/rw_core/main.go b/rw_core/main.go
index e8bc6ab..8a9675d 100644
--- a/rw_core/main.go
+++ b/rw_core/main.go
@@ -137,7 +137,7 @@
logger.Infow(ctx, "received-a-closing-signal", log.Fields{"code": code})
// Cleanup before leaving
- core.Stop()
+ core.Stop(shutdownCtx)
elapsed := time.Since(start)
logger.Infow(ctx, "rw-core-run-time", log.Fields{"core": instanceID, "time": elapsed / time.Second})
diff --git a/rw_core/mocks/adapter.go b/rw_core/mocks/adapter.go
index b99ca52..5a9130e 100644
--- a/rw_core/mocks/adapter.go
+++ b/rw_core/mocks/adapter.go
@@ -169,12 +169,11 @@
ta.Probe.UpdateStatus(ctx, serviceName, probe.ServiceStatusStopped)
}
-func setAndTestCoreServiceHandler(ctx context.Context, conn *grpc.ClientConn, clientConn *common.Connection) interface{} {
- svc := core_service.NewCoreServiceClient(conn)
- if h, err := svc.GetHealthStatus(ctx, clientConn); err != nil || h.State != health.HealthStatus_HEALTHY {
+func setCoreServiceHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
+ if conn == nil {
return nil
}
- return svc
+ return core_service.NewCoreServiceClient(conn)
}
// gRPC service
diff --git a/rw_core/mocks/adapter_olt.go b/rw_core/mocks/adapter_olt.go
index 96dfb33..0e8c58e 100644
--- a/rw_core/mocks/adapter_olt.go
+++ b/rw_core/mocks/adapter_olt.go
@@ -22,11 +22,14 @@
"fmt"
"strconv"
"strings"
+ "time"
"github.com/golang/protobuf/ptypes/empty"
"github.com/opencord/voltha-lib-go/v7/pkg/probe"
+ "github.com/opencord/voltha-protos/v5/go/adapter_service"
"github.com/opencord/voltha-protos/v5/go/common"
"github.com/opencord/voltha-protos/v5/go/extension"
+ "github.com/opencord/voltha-protos/v5/go/health"
"github.com/opencord/voltha-protos/v5/go/omci"
"github.com/phayes/freeport"
@@ -92,13 +95,14 @@
// Establish grpc connection to Core
if oltA.coreClient, err = vgrpc.NewClient(
- "olt-endpoint",
+ "mock-olt-endpoint",
oltA.coreEnpoint,
+ "core_service.CoreService",
oltA.oltRestarted); err != nil {
logger.Fatal(ctx, "grpc-client-not-created")
}
- go oltA.coreClient.Start(probeCtx, setAndTestCoreServiceHandler)
+ go oltA.coreClient.Start(probeCtx, setCoreServiceHandler)
logger.Debugw(ctx, "OLTAdapter-started", log.Fields{"grpc-address": oltA.serviceEndpoint})
@@ -431,3 +435,40 @@
}
}
+
+func (oltA *OLTAdapter) GetHealthStatus(stream adapter_service.AdapterService_GetHealthStatusServer) error {
+ ctx := context.Background()
+ logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+ if stream == nil {
+ return fmt.Errorf("conn-is-nil %v", stream)
+ }
+ initialRequestTime := time.Now()
+ var remoteClient *common.Connection
+ var tempClient *common.Connection
+ var err error
+loop:
+ for {
+ tempClient, err = stream.Recv()
+ if err != nil {
+ logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+ break loop
+ }
+ err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+ if err != nil {
+ logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+ break loop
+ }
+ remoteClient = tempClient
+ logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+ select {
+ case <-stream.Context().Done():
+ logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+ break loop
+ default:
+ }
+ }
+ logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+ return err
+}
diff --git a/rw_core/mocks/adapter_onu.go b/rw_core/mocks/adapter_onu.go
index d5669c3..dcfc60c 100644
--- a/rw_core/mocks/adapter_onu.go
+++ b/rw_core/mocks/adapter_onu.go
@@ -21,13 +21,16 @@
"fmt"
"strconv"
"strings"
+ "time"
"github.com/golang/protobuf/ptypes/empty"
vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
"github.com/opencord/voltha-lib-go/v7/pkg/probe"
+ "github.com/opencord/voltha-protos/v5/go/adapter_service"
"github.com/opencord/voltha-protos/v5/go/common"
ca "github.com/opencord/voltha-protos/v5/go/core_adapter"
"github.com/opencord/voltha-protos/v5/go/extension"
+ "github.com/opencord/voltha-protos/v5/go/health"
"github.com/phayes/freeport"
"github.com/gogo/protobuf/proto"
@@ -87,12 +90,13 @@
// Establish grpc connection to Core
if onuA.coreClient, err = vgrpc.NewClient(
- "onu-endpoint",
+ "mock-onu-endpoint",
onuA.coreEnpoint,
+ "core_service.CoreService",
onuA.onuRestarted); err != nil {
logger.Fatal(ctx, "grpc-client-not-created")
}
- go onuA.coreClient.Start(probeCtx, setAndTestCoreServiceHandler)
+ go onuA.coreClient.Start(probeCtx, setCoreServiceHandler)
logger.Debugw(ctx, "ONUAdapter-started", log.Fields{"grpc-address": onuA.serviceEndpoint})
}
@@ -297,3 +301,40 @@
func (onuA *ONUAdapter) StartOmciTest(ctx context.Context, _ *ca.OMCITest) (*omci.TestResponse, error) { // nolint
return &omci.TestResponse{Result: omci.TestResponse_SUCCESS}, nil
}
+
+func (onuA *ONUAdapter) GetHealthStatus(stream adapter_service.AdapterService_GetHealthStatusServer) error {
+ ctx := context.Background()
+ logger.Debugw(ctx, "receive-stream-connection", log.Fields{"stream": stream})
+
+ if stream == nil {
+ return fmt.Errorf("conn-is-nil %v", stream)
+ }
+ initialRequestTime := time.Now()
+ var remoteClient *common.Connection
+ var tempClient *common.Connection
+ var err error
+loop:
+ for {
+ tempClient, err = stream.Recv()
+ if err != nil {
+ logger.Warnw(ctx, "received-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+ break loop
+ }
+ err = stream.Send(&health.HealthStatus{State: health.HealthStatus_HEALTHY})
+ if err != nil {
+ logger.Warnw(ctx, "sending-stream-error", log.Fields{"remote-client": remoteClient, "error": err})
+ break loop
+ }
+ remoteClient = tempClient
+ logger.Debugw(ctx, "received-keep-alive", log.Fields{"remote-client": remoteClient})
+
+ select {
+ case <-stream.Context().Done():
+ logger.Infow(ctx, "stream-keep-alive-context-done", log.Fields{"remote-client": remoteClient, "error": stream.Context().Err()})
+ break loop
+ default:
+ }
+ }
+ logger.Errorw(ctx, "connection-down", log.Fields{"remote-client": remoteClient, "error": err, "initial-conn-time": initialRequestTime})
+ return err
+}
diff --git a/rw_core/test/core_nbi_handler_multi_test.go b/rw_core/test/core_nbi_handler_multi_test.go
index 2856744..6f5a37e 100755
--- a/rw_core/test/core_nbi_handler_multi_test.go
+++ b/rw_core/test/core_nbi_handler_multi_test.go
@@ -263,7 +263,7 @@
}
}
if nb.core != nil {
- nb.core.Stop()
+ nb.core.Stop(ctx)
}
}