enabling grpc retry
Change-Id: I3e83654980386cca080f5ff5f2168e9b405fee1f
diff --git a/rw_core/config/config.go b/rw_core/config/config.go
index c928a84..1fe1940 100644
--- a/rw_core/config/config.go
+++ b/rw_core/config/config.go
@@ -60,6 +60,8 @@
BackoffRetryInitialInterval time.Duration
BackoffRetryMaxElapsedTime time.Duration
BackoffRetryMaxInterval time.Duration
+ PerRPCRetryTimeout time.Duration
+ MaxRetries uint
}
// ParseCommandArguments parses the arguments when running read-write core service
@@ -201,6 +203,13 @@
"backoff_retry_max_interval",
1*time.Minute,
"The maximum number of milliseconds of an exponential backoff interval")
-
+ fs.DurationVar(&cf.PerRPCRetryTimeout,
+ "per_rpc_retry_timeout",
+ 0*time.Second,
+ "The default timeout per RPC retry")
+ fs.UintVar(&cf.MaxRetries,
+ "max_grpc_client_retry",
+ 0,
+ "The maximum number of times olt adaptor will retry in case grpc request timeouts")
_ = fs.Parse(args)
}
diff --git a/rw_core/core/adapter/agent.go b/rw_core/core/adapter/agent.go
index 973f767..c25c793 100644
--- a/rw_core/core/adapter/agent.go
+++ b/rw_core/core/adapter/agent.go
@@ -19,14 +19,15 @@
import (
"context"
"errors"
- "sync"
- "time"
-
+ grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
"github.com/opencord/voltha-lib-go/v7/pkg/log"
"github.com/opencord/voltha-protos/v5/go/adapter_service"
"github.com/opencord/voltha-protos/v5/go/voltha"
"google.golang.org/grpc"
+ codes "google.golang.org/grpc/codes"
+ "sync"
+ "time"
)
// agent represents adapter agent
@@ -39,6 +40,8 @@
onAdapterRestart vgrpc.RestartedHandler
liveProbeInterval time.Duration
coreEndpoint string
+ maxRetries uint
+ perRPCRetryTimeout time.Duration
}
func getAdapterServiceClientHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
@@ -48,13 +51,15 @@
return adapter_service.NewAdapterServiceClient(conn)
}
-func newAdapterAgent(coreEndpoint string, adapter *voltha.Adapter, onAdapterRestart vgrpc.RestartedHandler, liveProbeInterval time.Duration) *agent {
+func newAdapterAgent(coreEndpoint string, adapter *voltha.Adapter, onAdapterRestart vgrpc.RestartedHandler, liveProbeInterval time.Duration, maxRetries uint, perRPCRetryTimeout time.Duration) *agent {
return &agent{
adapter: adapter,
onAdapterRestart: onAdapterRestart,
adapterAPIEndPoint: adapter.Endpoint,
liveProbeInterval: liveProbeInterval,
coreEndpoint: coreEndpoint,
+ maxRetries: maxRetries,
+ perRPCRetryTimeout: perRPCRetryTimeout,
}
}
@@ -71,8 +76,15 @@
// Add a liveness communication update
aa.vClient.SubscribeForLiveness(aa.updateCommunicationTime)
-
- go aa.vClient.Start(ctx, getAdapterServiceClientHandler)
+ // The backoff option sets the wait time between gRPC retries; if it is not set, the library default of 50ms is used, which is too low. The 0.2 jitter fraction spreads each wait over the range [0.8*perRPCRetryTimeout, 1.2*perRPCRetryTimeout].
+ backoffCtxOption := grpc_retry.WithBackoff(grpc_retry.BackoffLinearWithJitter(aa.perRPCRetryTimeout, 0.2))
+ retryCodes := []codes.Code{
+ codes.Unavailable, // server is currently unavailable
+ codes.DeadlineExceeded, // deadline for the operation was exceeded
+ }
+ grpcRetryOptions := grpc_retry.UnaryClientInterceptor(grpc_retry.WithMax(aa.maxRetries), grpc_retry.WithPerRetryTimeout(aa.perRPCRetryTimeout), grpc_retry.WithCodes(retryCodes...), backoffCtxOption)
+ logger.Debug(ctx, "Configuration values", log.Fields{"RETRY": aa.maxRetries, "TIMEOUT": aa.perRPCRetryTimeout})
+ go aa.vClient.Start(ctx, getAdapterServiceClientHandler, grpcRetryOptions)
return nil
}
diff --git a/rw_core/core/adapter/manager.go b/rw_core/core/adapter/manager.go
index 1ab4341..d73da79 100644
--- a/rw_core/core/adapter/manager.go
+++ b/rw_core/core/adapter/manager.go
@@ -52,6 +52,8 @@
lockDeviceTypesMap sync.RWMutex
lockAdapterEndPointsMap sync.RWMutex
liveProbeInterval time.Duration
+ PerRPCRetryTimeout time.Duration
+ MaxRetries uint
coreEndpoint string
rollingUpdateMap map[string]bool
rollingUpdateLock sync.RWMutex
@@ -70,6 +72,8 @@
coreInstanceID string,
backend *db.Backend,
liveProbeInterval time.Duration,
+ maxRetries uint,
+ perRPCRetryTimeout time.Duration,
) *Manager {
return &Manager{
adapterDbProxy: dbPath.Proxy("adapters"),
@@ -80,6 +84,8 @@
endpointMgr: NewEndpointManager(backend),
liveProbeInterval: liveProbeInterval,
coreEndpoint: coreEndpoint,
+ MaxRetries: maxRetries,
+ PerRPCRetryTimeout: perRPCRetryTimeout,
rollingUpdateMap: make(map[string]bool),
rxStreamCloseChMap: make(map[string]chan bool),
}
@@ -196,7 +202,7 @@
// Use a muted adapter restart handler which is invoked by the corresponding gRPC client on an adapter restart.
// This handler just log the restart event. The actual action taken following an adapter restart
// will be done when an adapter re-registers itself.
- aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval)
+ aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval, aMgr.MaxRetries, aMgr.PerRPCRetryTimeout)
aMgr.adapterEndpoints[Endpoint(adapter.Endpoint)] = aMgr.adapterAgents[adapter.Id]
}
return nil
@@ -229,7 +235,7 @@
// Use a muted adapter restart handler which is invoked by the corresponding gRPC client on an adapter restart.
// This handler just log the restart event. The actual action taken following an adapter restart
// will be done when an adapter re-registers itself.
- aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval)
+ aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval, aMgr.MaxRetries, aMgr.PerRPCRetryTimeout)
aMgr.adapterEndpoints[Endpoint(adapter.Endpoint)] = aMgr.adapterAgents[adapter.Id]
return nil
}
diff --git a/rw_core/core/core.go b/rw_core/core/core.go
index 0df3b4e..c4eb6e7 100644
--- a/rw_core/core/core.go
+++ b/rw_core/core/core.go
@@ -143,7 +143,7 @@
dbPath := model.NewDBPath(backend)
// load adapters & device types while other things are starting
- adapterMgr := adapter.NewAdapterManager(cf.GrpcSBIAddress, dbPath, id, backend, cf.LiveProbeInterval)
+ adapterMgr := adapter.NewAdapterManager(cf.GrpcSBIAddress, dbPath, id, backend, cf.LiveProbeInterval, cf.MaxRetries, cf.PerRPCRetryTimeout)
adapterMgr.Start(ctx, adapterService)
// We do not do a defer adapterMgr.Stop() here as we want this to be ran as soon as
diff --git a/rw_core/core/device/agent_test.go b/rw_core/core/device/agent_test.go
index 076438c..49c4d6b 100755
--- a/rw_core/core/device/agent_test.go
+++ b/rw_core/core/device/agent_test.go
@@ -131,7 +131,7 @@
LivenessChannelInterval: cfg.LiveProbeInterval / 2}
proxy := model.NewDBPath(backend)
- dat.adapterMgr = adapter.NewAdapterManager("test-endpoint", proxy, dat.coreInstanceID, backend, 5)
+ dat.adapterMgr = adapter.NewAdapterManager("test-endpoint", proxy, dat.coreInstanceID, backend, 5, 6, 5)
eventProxy := events.NewEventProxy(events.MsgClient(dat.kEventClient), events.MsgTopic(kafka.Topic{Name: cfg.EventTopic}))
dat.deviceMgr, dat.logicalDeviceMgr = NewManagers(proxy, dat.adapterMgr, cfg, dat.coreInstanceID, eventProxy)
dat.adapterMgr.Start(context.Background(), "agent-test")
diff --git a/rw_core/core/device/logical_agent_test.go b/rw_core/core/device/logical_agent_test.go
index 8c9c8ee..3c626c6 100644
--- a/rw_core/core/device/logical_agent_test.go
+++ b/rw_core/core/device/logical_agent_test.go
@@ -154,7 +154,7 @@
LivenessChannelInterval: cfg.LiveProbeInterval / 2}
proxy := model.NewDBPath(backend)
- adapterMgr := adapter.NewAdapterManager("test-endpoint", proxy, lda.coreInstanceID, backend, 5)
+ adapterMgr := adapter.NewAdapterManager("test-endpoint", proxy, lda.coreInstanceID, backend, 5, 6, 5)
eventProxy := events.NewEventProxy(events.MsgClient(lda.kEventClient), events.MsgTopic(kafka.Topic{Name: cfg.EventTopic}))
lda.deviceMgr, lda.logicalDeviceMgr = NewManagers(proxy, adapterMgr, cfg, lda.coreInstanceID, eventProxy)
adapterMgr.Start(context.Background(), "logical-test")