Enable gRPC client retry towards adapters

Add per_rpc_retry_timeout and max_grpc_client_retry config options and
use them to install the go-grpc-middleware retry interceptor (linear
backoff with jitter, retrying UNAVAILABLE and DEADLINE_EXCEEDED) on the
adapter service gRPC client.

Change-Id: I3e83654980386cca080f5ff5f2168e9b405fee1f
diff --git a/rw_core/config/config.go b/rw_core/config/config.go
index c928a84..1fe1940 100644
--- a/rw_core/config/config.go
+++ b/rw_core/config/config.go
@@ -60,6 +60,8 @@
 	BackoffRetryInitialInterval time.Duration
 	BackoffRetryMaxElapsedTime  time.Duration
 	BackoffRetryMaxInterval     time.Duration
+	PerRPCRetryTimeout          time.Duration
+	MaxRetries                  uint
 }
 
 // ParseCommandArguments parses the arguments when running read-write core service
@@ -201,6 +203,14 @@
 		"backoff_retry_max_interval",
 		1*time.Minute,
 		"The maximum number of milliseconds of an exponential backoff interval")
-
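+	// These values feed the gRPC retry interceptor on the adapter client; the defaults of 0 leave retries disabled.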
+	fs.DurationVar(&cf.PerRPCRetryTimeout,
+		"per_rpc_retry_timeout",
+		0*time.Second,
+		"The default timeout for each gRPC retry attempt")
+	fs.UintVar(&cf.MaxRetries,
+		"max_grpc_client_retry",
+		0,
+		"The maximum number of times the gRPC client will retry a request towards an adapter when it times out or the adapter is unavailable")
 	_ = fs.Parse(args)
 }
diff --git a/rw_core/core/adapter/agent.go b/rw_core/core/adapter/agent.go
index 973f767..c25c793 100644
--- a/rw_core/core/adapter/agent.go
+++ b/rw_core/core/adapter/agent.go
@@ -19,14 +19,15 @@
 import (
 	"context"
 	"errors"
-	"sync"
-	"time"
-
+	grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
 	vgrpc "github.com/opencord/voltha-lib-go/v7/pkg/grpc"
 	"github.com/opencord/voltha-lib-go/v7/pkg/log"
 	"github.com/opencord/voltha-protos/v5/go/adapter_service"
 	"github.com/opencord/voltha-protos/v5/go/voltha"
 	"google.golang.org/grpc"
+	codes "google.golang.org/grpc/codes"
+	"sync"
+	"time"
 )
 
 // agent represents adapter agent
@@ -39,6 +40,8 @@
 	onAdapterRestart   vgrpc.RestartedHandler
 	liveProbeInterval  time.Duration
 	coreEndpoint       string
+	maxRetries         uint
+	perRPCRetryTimeout time.Duration
 }
 
 func getAdapterServiceClientHandler(ctx context.Context, conn *grpc.ClientConn) interface{} {
@@ -48,13 +51,15 @@
 	return adapter_service.NewAdapterServiceClient(conn)
 }
 
-func newAdapterAgent(coreEndpoint string, adapter *voltha.Adapter, onAdapterRestart vgrpc.RestartedHandler, liveProbeInterval time.Duration) *agent {
+func newAdapterAgent(coreEndpoint string, adapter *voltha.Adapter, onAdapterRestart vgrpc.RestartedHandler, liveProbeInterval time.Duration, maxRetries uint, perRPCRetryTimeout time.Duration) *agent {
 	return &agent{
 		adapter:            adapter,
 		onAdapterRestart:   onAdapterRestart,
 		adapterAPIEndPoint: adapter.Endpoint,
 		liveProbeInterval:  liveProbeInterval,
 		coreEndpoint:       coreEndpoint,
+		maxRetries:         maxRetries,
+		perRPCRetryTimeout: perRPCRetryTimeout,
 	}
 }
 
@@ -71,8 +76,16 @@
 
 	// Add a liveness communication update
 	aa.vClient.SubscribeForLiveness(aa.updateCommunicationTime)
-
-	go aa.vClient.Start(ctx, getAdapterServiceClientHandler)
+	// The backoff function sets the wait time between gRPC retries; if unset it defaults to 50ms, which is too low. A jitter fraction of 0.2 spreads each retry wait over [perRPCRetryTimeout*0.8, perRPCRetryTimeout*1.2].
+	backoffCtxOption := grpc_retry.WithBackoff(grpc_retry.BackoffLinearWithJitter(aa.perRPCRetryTimeout, 0.2))
+	retryCodes := []codes.Code{
+		codes.Unavailable,      // server is currently unavailable
+		codes.DeadlineExceeded, // deadline for the operation was exceeded
+	}
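+	// Only calls failing with the codes above are retried, up to maxRetries, with each attempt bounded by perRPCRetryTimeout.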
+	grpcRetryOptions := grpc_retry.UnaryClientInterceptor(grpc_retry.WithMax(aa.maxRetries), grpc_retry.WithPerRetryTimeout(aa.perRPCRetryTimeout), grpc_retry.WithCodes(retryCodes...), backoffCtxOption)
+	logger.Debugw(ctx, "grpc-retry-configuration", log.Fields{"max-retries": aa.maxRetries, "per-rpc-retry-timeout": aa.perRPCRetryTimeout})
+	go aa.vClient.Start(ctx, getAdapterServiceClientHandler, grpcRetryOptions)
 	return nil
 }
 
diff --git a/rw_core/core/adapter/manager.go b/rw_core/core/adapter/manager.go
index 1ab4341..d73da79 100644
--- a/rw_core/core/adapter/manager.go
+++ b/rw_core/core/adapter/manager.go
@@ -52,6 +52,8 @@
 	lockDeviceTypesMap      sync.RWMutex
 	lockAdapterEndPointsMap sync.RWMutex
 	liveProbeInterval       time.Duration
+	PerRPCRetryTimeout      time.Duration
+	MaxRetries              uint
 	coreEndpoint            string
 	rollingUpdateMap        map[string]bool
 	rollingUpdateLock       sync.RWMutex
@@ -70,6 +72,8 @@
 	coreInstanceID string,
 	backend *db.Backend,
 	liveProbeInterval time.Duration,
+	maxRetries uint,
+	perRPCRetryTimeout time.Duration,
 ) *Manager {
 	return &Manager{
 		adapterDbProxy:     dbPath.Proxy("adapters"),
@@ -80,6 +84,8 @@
 		endpointMgr:        NewEndpointManager(backend),
 		liveProbeInterval:  liveProbeInterval,
 		coreEndpoint:       coreEndpoint,
+		MaxRetries:         maxRetries,
+		PerRPCRetryTimeout: perRPCRetryTimeout,
 		rollingUpdateMap:   make(map[string]bool),
 		rxStreamCloseChMap: make(map[string]chan bool),
 	}
@@ -196,7 +202,7 @@
 		// Use a muted adapter restart handler which is invoked by the corresponding gRPC client on an adapter restart.
 		// This handler just log the restart event.  The actual action taken following an adapter restart
 		// will be done when an adapter re-registers itself.
-		aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval)
+		aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval, aMgr.MaxRetries, aMgr.PerRPCRetryTimeout)
 		aMgr.adapterEndpoints[Endpoint(adapter.Endpoint)] = aMgr.adapterAgents[adapter.Id]
 	}
 	return nil
@@ -229,7 +235,7 @@
 	// Use a muted adapter restart handler which is invoked by the corresponding gRPC client on an adapter restart.
 	// This handler just log the restart event.  The actual action taken following an adapter restart
 	// will be done when an adapter re-registers itself.
-	aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval)
+	aMgr.adapterAgents[adapter.Id] = newAdapterAgent(aMgr.coreEndpoint, clonedAdapter, aMgr.mutedAdapterRestartedHandler, aMgr.liveProbeInterval, aMgr.MaxRetries, aMgr.PerRPCRetryTimeout)
 	aMgr.adapterEndpoints[Endpoint(adapter.Endpoint)] = aMgr.adapterAgents[adapter.Id]
 	return nil
 }
diff --git a/rw_core/core/core.go b/rw_core/core/core.go
index 0df3b4e..c4eb6e7 100644
--- a/rw_core/core/core.go
+++ b/rw_core/core/core.go
@@ -143,7 +143,8 @@
 	dbPath := model.NewDBPath(backend)
 
 	// load adapters & device types while other things are starting
-	adapterMgr := adapter.NewAdapterManager(cf.GrpcSBIAddress, dbPath, id, backend, cf.LiveProbeInterval)
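+	// cf.MaxRetries and cf.PerRPCRetryTimeout come from the new max_grpc_client_retry and per_rpc_retry_timeout flags.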
+	adapterMgr := adapter.NewAdapterManager(cf.GrpcSBIAddress, dbPath, id, backend, cf.LiveProbeInterval, cf.MaxRetries, cf.PerRPCRetryTimeout)
 	adapterMgr.Start(ctx, adapterService)
 
 	// We do not do a defer adapterMgr.Stop() here as we want this to be ran as soon as
diff --git a/rw_core/core/device/agent_test.go b/rw_core/core/device/agent_test.go
index 076438c..49c4d6b 100755
--- a/rw_core/core/device/agent_test.go
+++ b/rw_core/core/device/agent_test.go
@@ -131,7 +131,7 @@
 		LivenessChannelInterval: cfg.LiveProbeInterval / 2}
 
 	proxy := model.NewDBPath(backend)
-	dat.adapterMgr = adapter.NewAdapterManager("test-endpoint", proxy, dat.coreInstanceID, backend, 5)
+	dat.adapterMgr = adapter.NewAdapterManager("test-endpoint", proxy, dat.coreInstanceID, backend, 5, 6, 5)
 	eventProxy := events.NewEventProxy(events.MsgClient(dat.kEventClient), events.MsgTopic(kafka.Topic{Name: cfg.EventTopic}))
 	dat.deviceMgr, dat.logicalDeviceMgr = NewManagers(proxy, dat.adapterMgr, cfg, dat.coreInstanceID, eventProxy)
 	dat.adapterMgr.Start(context.Background(), "agent-test")
diff --git a/rw_core/core/device/logical_agent_test.go b/rw_core/core/device/logical_agent_test.go
index 8c9c8ee..3c626c6 100644
--- a/rw_core/core/device/logical_agent_test.go
+++ b/rw_core/core/device/logical_agent_test.go
@@ -154,7 +154,7 @@
 		LivenessChannelInterval: cfg.LiveProbeInterval / 2}
 
 	proxy := model.NewDBPath(backend)
-	adapterMgr := adapter.NewAdapterManager("test-endpoint", proxy, lda.coreInstanceID, backend, 5)
+	adapterMgr := adapter.NewAdapterManager("test-endpoint", proxy, lda.coreInstanceID, backend, 5, 6, 5)
 	eventProxy := events.NewEventProxy(events.MsgClient(lda.kEventClient), events.MsgTopic(kafka.Topic{Name: cfg.EventTopic}))
 	lda.deviceMgr, lda.logicalDeviceMgr = NewManagers(proxy, adapterMgr, cfg, lda.coreInstanceID, eventProxy)
 	adapterMgr.Start(context.Background(), "logical-test")