[VOL-5399] - Handle reconcile issues during upgrade scenario and other fixes in reconcile
Change-Id: I5b44b7a1847885c49cf95cf232be640e8d8e6094
Signed-off-by: Sridhar Ravindra <sridhar.ravindra@radisys.com>
diff --git a/VERSION b/VERSION
index e4a0356..26664c7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.6.3-dev2
+3.6.3-dev3
diff --git a/rw_core/core/adapter/manager.go b/rw_core/core/adapter/manager.go
index 7a07108..88b755c 100644
--- a/rw_core/core/adapter/manager.go
+++ b/rw_core/core/adapter/manager.go
@@ -370,12 +370,11 @@
agt.resetConnection(ctx)
}
- go func() {
- err := aMgr.onAdapterRestart(log.WithSpanFromContext(context.Background(), ctx), adpt.Endpoint)
- if err != nil {
- logger.Errorw(ctx, "unable-to-restart-adapter", log.Fields{"error": err})
- }
- }()
+ err = aMgr.onAdapterRestart(log.WithSpanFromContext(context.Background(), ctx), adpt.Endpoint)
+ if err != nil {
+ logger.Errorw(ctx, "unable-to-restart-adapter", log.Fields{"error": err})
+ }
+
return &empty.Empty{}, nil
}
// Save the adapter and the device types
diff --git a/rw_core/core/device/agent.go b/rw_core/core/device/agent.go
index 49d359d..92a7e1c 100755
--- a/rw_core/core/device/agent.go
+++ b/rw_core/core/device/agent.go
@@ -1464,16 +1464,24 @@
return err
}
-func (agent *Agent) ReconcileDevice(ctx context.Context) {
- // Do not reconcile if the device was in DELETE_FAILED transient state. Just invoke the force delete on that device.
+func (agent *Agent) UpdateTransientStateToReconcile(ctx context.Context) error {
+ requestStatus := &common.OperationResp{Code: common.OperationResp_OPERATION_FAILURE}
+ var desc string
+
+ //set transient state to RECONCILE IN PROGRESS
+ err := agent.updateTransientState(ctx, core.DeviceTransientState_RECONCILE_IN_PROGRESS)
+ if err != nil {
+ logger.Errorw(ctx, "setting-transient-state-failed", log.Fields{"error": err})
+ agent.logDeviceUpdate(ctx, nil, nil, requestStatus, nil, desc)
+ return err
+ }
+ return nil
+}
+
+func (agent *Agent) StartReconcileWithRetry(ctx context.Context) {
+
state := agent.getTransientState()
logger.Debugw(ctx, "starting-reconcile", log.Fields{"device-id": agent.deviceID, "state": state})
- if agent.getTransientState() == core.DeviceTransientState_DELETE_FAILED {
- if err := agent.DeleteDevicePostAdapterRestart(ctx); err != nil {
- logger.Errorw(ctx, "delete-post-restart-failed", log.Fields{"error": err, "device-id": agent.deviceID})
- }
- return
- }
requestStatus := &common.OperationResp{Code: common.OperationResp_OPERATION_FAILURE}
var desc string
@@ -1498,15 +1506,6 @@
return
}
- //set transient state to RECONCILE IN PROGRESS
- err := agent.updateTransientState(ctx, core.DeviceTransientState_RECONCILE_IN_PROGRESS)
- if err != nil {
- agent.requestQueue.RequestComplete()
- logger.Errorw(ctx, "setting-transient-state-failed", log.Fields{"error": err})
- agent.logDeviceUpdate(ctx, nil, nil, requestStatus, nil, desc)
- return
- }
-
reconcilingBackoff := backoff.NewExponentialBackOff()
reconcilingBackoff.InitialInterval = agent.config.BackoffRetryInitialInterval
reconcilingBackoff.MaxElapsedTime = agent.config.BackoffRetryMaxElapsedTime
@@ -1514,12 +1513,20 @@
//making here to keep lifecycle of this channel within the scope of retryReconcile
agent.stopReconcilingMutex.Lock()
- agent.stopReconciling = make(chan int, 1)
+ if agent.stopReconciling != nil {
+ logger.Warnw(ctx, "Reconciling with retries is already in progress, don't proceed further", log.Fields{"device-id": device.Id})
+ agent.stopReconcilingMutex.Unlock()
+ agent.requestQueue.RequestComplete()
+ return
+ } else {
+ agent.stopReconciling = make(chan int, 1)
+ }
agent.stopReconcilingMutex.Unlock()
// defined outside the retry loop so it can be cleaned
// up when the loop breaks
var backoffTimer *time.Timer
+ isDeviceReconciledErr := false
retry:
for {
@@ -1537,6 +1544,19 @@
duration := reconcilingBackoff.NextBackOff()
//This case should never occur in default case as max elapsed time for backoff is 0(by default) , so it will never return stop
if duration == backoff.Stop {
+ // If we have received device reconciled error and the retry intervals have elapsed
+ // clean up the reconcile and break the retry loop
+ if isDeviceReconciledErr {
+ logger.Warnw(ctx, "reached max retry with device reconciled error", log.Fields{"max-reconciling-backoff": reconcilingBackoff.MaxElapsedTime,
+ "device-id": device.Id})
+ // Release lock before reconcile clean up
+ agent.requestQueue.RequestComplete()
+ err := agent.reconcilingCleanup(ctx)
+ if err != nil {
+ logger.Errorf(ctx, "Error during reconcile cleanup", err.Error())
+ }
+ break retry
+ }
// If we reach a maximum then warn and reset the backoff
// timer and keep attempting.
logger.Warnw(ctx, "maximum-reconciling-backoff-reached--resetting-backoff-timer",
@@ -1555,16 +1575,6 @@
// Send a reconcile request to the adapter.
err := agent.sendReconcileRequestToAdapter(ctx, device)
- // Check the transient state after a response from the adapter. If a device delete
- // request was issued due to a callback during that time and failed then just delete
- // the device and stop the reconcile loop and invoke the device deletion
- if agent.getTransientState() == core.DeviceTransientState_DELETE_FAILED {
- if dErr := agent.DeleteDevicePostAdapterRestart(ctx); dErr != nil {
- logger.Errorw(ctx, "delete-post-restart-failed", log.Fields{"error": dErr, "device-id": agent.deviceID})
- }
- break retry
- }
-
if errors.Is(err, errContextExpired) || errors.Is(err, errReconcileAborted) {
logger.Errorw(ctx, "reconcile-aborted", log.Fields{"error": err})
requestStatus = &common.OperationResp{Code: common.OperationResp_OperationReturnCode(common.OperStatus_FAILED)}
@@ -1576,27 +1586,52 @@
if ok {
// Decode the error code and error message
errorCode := st.Code()
+ // When the device is deleted and the core is still retrying the reconcile, adapter will send Not Found error to core when DeviceStateUpdate fails.
+ // This will enable core to stop the reconcile for the deleted device.
+ /*
+ As the grcp connection is now reused (on adapters restart), there could be two scenarios where we could land
+ in a stiuation where the reconcile is in a loop for already reconciled ONUs.
+ 1 If the reconcile lands on an older instance of the adapters we get this error "device already reconciled",
+ in which case core will retry. Once the new adapter instance comes up and old adapter instance is stopped, this will go through successful.
+ 2 If the core restarts during reconcile of device, the reconcile will be done by the adapter but before
+ the response reaches the core could have restarted. This leaves the tansient state of the device in reconciling.
+ Once the core comes up , the reconcile is fired again and as the device was previously reconciled adapter returns this error.
+ Either of the cases we retry for retryinterval and assume that the device is already reconciled in the new adapter.
+ */
if errorCode == codes.AlreadyExists {
logger.Warnw(ctx, "device already reconciled", log.Fields{"error": err})
- err := agent.reconcilingCleanup(ctx)
- if err != nil {
- logger.Errorf(ctx, "error during reconcile cleanup", err.Error())
+ // Reset the backoff current interval when we receive the device reconciled error for the first time
+ // so that we can retry for one full backoff cycle and then break
+ if !isDeviceReconciledErr {
+ isDeviceReconciledErr = true
+ reconcilingBackoff.Reset()
}
- break retry
-
+ } else {
+ isDeviceReconciledErr = false
}
-
}
if err != nil {
agent.logDeviceUpdate(ctx, nil, nil, requestStatus, err, desc)
- <-backoffTimer.C
- // backoffTimer expired continue
- // Take lock back before retrying
- if err := agent.requestQueue.WaitForGreenLight(ctx); err != nil {
- agent.logDeviceUpdate(ctx, nil, nil, requestStatus, err, desc)
- break retry
+ select {
+ case <-backoffTimer.C:
+ // backoffTimer expired continue
+ // Take lock back before retrying
+ if err := agent.requestQueue.WaitForGreenLight(ctx); err != nil {
+ desc = "failed-to-acquire-lock"
+ agent.logDeviceUpdate(ctx, nil, nil, requestStatus, err, desc)
+ break retry
+ }
+ continue
+
+ case _, ok := (<-agent.exitChannel):
+ // If the device is deleted, the agent would be stopped. Check for agent status before retrying the reconcile
+ if !ok {
+ logger.Warnw(ctx, "device agent stopped, aborting reconcile", log.Fields{"device-id": agent.deviceID})
+ desc = "device-agent-stopped"
+ agent.logDeviceUpdate(ctx, nil, nil, requestStatus, err, desc)
+ break retry
+ }
}
- continue
}
// Success
requestStatus = &common.OperationResp{Code: common.OperationResp_OPERATION_SUCCESS}
@@ -1625,6 +1660,18 @@
}
}
+func (agent *Agent) ReconcileDevice(ctx context.Context) {
+
+ //set transient state to RECONCILE IN PROGRESS
+ err := agent.UpdateTransientStateToReconcile(ctx)
+ if err != nil {
+ logger.Errorw(ctx, "check-and-update-transient-state-failed", log.Fields{"error": err})
+ return
+ }
+
+ agent.StartReconcileWithRetry(ctx)
+}
+
func (agent *Agent) sendReconcileRequestToAdapter(ctx context.Context, device *voltha.Device) error {
logger.Debugw(ctx, "sending-reconcile-to-adapter", log.Fields{"device-id": device.Id, "endpoint": agent.adapterEndpoint})
client, err := agent.adapterMgr.GetAdapterClient(ctx, agent.adapterEndpoint)
diff --git a/rw_core/core/device/manager.go b/rw_core/core/device/manager.go
index 1f6b59f..f09ddb3 100755
--- a/rw_core/core/device/manager.go
+++ b/rw_core/core/device/manager.go
@@ -19,10 +19,11 @@
import (
"context"
"fmt"
- "github.com/opencord/voltha-protos/v5/go/common"
"sync"
"time"
+ "github.com/opencord/voltha-protos/v5/go/common"
+
"github.com/opencord/voltha-go/rw_core/config"
"github.com/opencord/voltha-lib-go/v7/pkg/probe"
"github.com/opencord/voltha-protos/v5/go/core"
@@ -387,12 +388,36 @@
"current-replica": adapter.CurrentReplica, "total-replicas": adapter.TotalReplicas,
"restarted-endpoint": adapter.Endpoint, "current-version": adapter.Version})
- numberOfDevicesToReconcile := 0
dMgr.deviceAgents.Range(func(key, value interface{}) bool {
deviceAgent, ok := value.(*Agent)
if ok && deviceAgent.adapterEndpoint == adapter.Endpoint {
// Before reconciling, abort in-process request
if err := deviceAgent.abortAllProcessing(utils.WithNewSpanAndRPCMetadataContext(ctx, "AbortProcessingOnRestart")); err == nil {
+ logger.Debugw(ctx, "setting transiet state",
+ log.Fields{
+ "device-id": deviceAgent.deviceID,
+ "root-device": deviceAgent.isRootDevice,
+ "restarted-endpoint": adapter.Endpoint,
+ "device-type": deviceAgent.deviceType,
+ "adapter-type": adapter.Type,
+ })
+ //set transient state to RECONCILE IN PROGRESS
+ err := deviceAgent.UpdateTransientStateToReconcile(ctx)
+ if err != nil {
+ logger.Errorw(ctx, "setting-transient-state-failed", log.Fields{"error": err})
+ }
+ } else {
+ logger.Errorw(ctx, "failed-aborting-exisiting-processing", log.Fields{"error": err})
+ }
+ }
+ return true
+ })
+
+ go func() {
+ numberOfDevicesToReconcile := 0
+ dMgr.deviceAgents.Range(func(key, value interface{}) bool {
+ deviceAgent, ok := value.(*Agent)
+ if ok && deviceAgent.adapterEndpoint == adapter.Endpoint {
logger.Debugw(ctx, "reconciling-device",
log.Fields{
"device-id": deviceAgent.deviceID,
@@ -401,15 +426,14 @@
"device-type": deviceAgent.deviceType,
"adapter-type": adapter.Type,
})
- deviceAgent.ReconcileDevice(utils.WithNewSpanAndRPCMetadataContext(ctx, "ReconcileDevice"))
+ go deviceAgent.StartReconcileWithRetry(utils.WithNewSpanAndRPCMetadataContext(ctx, "ReconcileDevice"))
numberOfDevicesToReconcile++
- } else {
- logger.Errorw(ctx, "failed-aborting-exisiting-processing", log.Fields{"error": err})
}
- }
- return true
- })
- logger.Debugw(ctx, "reconciling-on-adapter-restart-initiated", log.Fields{"adapter-endpoint": adapter.Endpoint, "number-of-devices-to-reconcile": numberOfDevicesToReconcile})
+ return true
+ })
+ logger.Debug(ctx, "reconciling-on-adapter-restart-initiated", log.Fields{"adapter-endpoint": adapter.Endpoint, "number-of-devices-to-reconcile": numberOfDevicesToReconcile})
+ }()
+
return nil
}
@@ -671,8 +695,8 @@
}
/*
-All the functions below are callback functions where they are invoked with the latest and previous data. We can
-therefore use the data as is without trying to get the latest from the model.
+ All the functions below are callback functions where they are invoked with the latest and previous data. We can
+ therefore use the data as is without trying to get the latest from the model.
*/
// DisableAllChildDevices is invoked as a callback when the parent device is disabled