[VOL-4421] Execute force delete following an adapter restart
Change-Id: I1f27568ac5587740682ce39eaac86a4e813973e7
diff --git a/rw_core/core/device/agent.go b/rw_core/core/device/agent.go
index 818b976..0713f20 100755
--- a/rw_core/core/device/agent.go
+++ b/rw_core/core/device/agent.go
@@ -33,6 +33,7 @@
"github.com/gogo/protobuf/proto"
"github.com/golang/protobuf/ptypes/empty"
"github.com/opencord/voltha-go/rw_core/config"
+ "github.com/opencord/voltha-go/rw_core/utils"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
@@ -53,6 +54,7 @@
var errReconcileAborted = errors.New("reconcile aborted")
var errContextExpired = errors.New("context expired")
+var errNoConnection = errors.New("no connection")
// Agent represents device agent attributes
type Agent struct {
@@ -658,7 +660,7 @@
"adapter-endpoint": device.AdapterEndpoint,
})
agent.requestQueue.RequestComplete()
- return err
+ return fmt.Errorf("remote-not-reachable %w", errNoConnection)
}
subCtx, cancel := context.WithTimeout(coreutils.WithAllMetadataFromContext(ctx), agent.rpcTimeout)
requestStatus.Code = common.OperationResp_OPERATION_IN_PROGRESS
@@ -1352,6 +1354,9 @@
updatedState = core.DeviceTransientState_DELETE_FAILED
case core.DeviceTransientState_DELETING_FROM_ADAPTER:
updatedState = core.DeviceTransientState_DELETE_FAILED
+ case core.DeviceTransientState_DELETE_FAILED:
+ // do not change state
+ return nil
default:
updatedState = core.DeviceTransientState_NONE
}
@@ -1362,7 +1367,91 @@
return nil
}
+// DeleteDevicePostAdapterRestart sends a delete request for this device to its adapter.
+// ReconcileDevice calls it instead of reconciling when the device's transient state is
+// DELETE_FAILED.
+//
+// The method:
+//   - waits for the agent's request queue and returns that error if the wait fails;
+//   - returns nil without contacting the adapter when the device is PREPROVISIONED;
+//   - moves the transient state to FORCE_DELETING;
+//   - retries GetAdapterClient with an exponential backoff taken from the agent config.
+//     The backoff is reset whenever it reports Stop, so only ctx bounds the retry;
+//   - completes the queued request, then calls DeleteDevice on the adapter with the
+//     agent's RPC timeout and hands the outcome to onDeleteFailure / onDeleteSuccess.
+//
+// It returns an error when the queue wait fails, the transient state cannot be updated,
+// ctx ends before a client is obtained, or no client is available. The result of the
+// adapter's DeleteDevice call is not returned; it is only reported through the
+// onDelete* handlers.
+func (agent *Agent) DeleteDevicePostAdapterRestart(ctx context.Context) error {
+	logger.Debugw(ctx, "delete-post-restart", log.Fields{"device-id": agent.deviceID})
+	ctx = utils.WithNewSpanAndRPCMetadataContext(ctx, "DeleteDevicePostAdapterRestart")
+
+	if err := agent.requestQueue.WaitForGreenLight(ctx); err != nil {
+		return err
+	}
+
+	device := agent.getDeviceReadOnlyWithoutLock()
+	if device.AdminState == voltha.AdminState_PREPROVISIONED {
+		logger.Debugw(ctx, "device-in-preprovisioning-state-reconcile-not-needed", log.Fields{"device-id": device.Id})
+		agent.requestQueue.RequestComplete()
+		return nil
+	}
+
+	// Change device transient state to FORCE_DELETING
+	if err := agent.updateTransientState(ctx, core.DeviceTransientState_FORCE_DELETING); err != nil {
+		logger.Errorw(ctx, "failure-updating-transient-state", log.Fields{"error": err, "device-id": agent.deviceID})
+		agent.requestQueue.RequestComplete()
+		return err
+	}
+
+	// Ensure we have a valid grpc client available as we have just restarted
+	deleteBackoff := backoff.NewExponentialBackOff()
+	deleteBackoff.InitialInterval = agent.config.BackoffRetryInitialInterval
+	deleteBackoff.MaxElapsedTime = agent.config.BackoffRetryMaxElapsedTime
+	deleteBackoff.MaxInterval = agent.config.BackoffRetryMaxInterval
+
+	var client adapter_service.AdapterServiceClient
+	for {
+		var err error
+		client, err = agent.adapterMgr.GetAdapterClient(ctx, agent.adapterEndpoint)
+		if err == nil {
+			break
+		}
+		duration := deleteBackoff.NextBackOff()
+		if duration == backoff.Stop {
+			// The backoff budget is exhausted: start over so that only ctx ends the retry.
+			deleteBackoff.Reset()
+			duration = deleteBackoff.NextBackOff()
+		}
+		backoffTimer := time.NewTimer(duration)
+		select {
+		case <-backoffTimer.C:
+			logger.Debugw(ctx, "backoff-timer-expires", log.Fields{"device-id": agent.deviceID})
+		case <-ctx.Done():
+			// The timer is never reused, so stopping it is enough; no drain is needed.
+			backoffTimer.Stop()
+			agent.requestQueue.RequestComplete()
+			return ctx.Err()
+		}
+	}
+	if client == nil {
+		// A nil client with a nil error must not be reported as a successful delete.
+		agent.requestQueue.RequestComplete()
+		return fmt.Errorf("adapter-client-unavailable %w", errNoConnection)
+	}
+
+	// Release the device lock to allow for device state update, if any
+	agent.requestQueue.RequestComplete()
+
+	// Send the delete request to the adapter
+	subCtx, cancel := context.WithTimeout(coreutils.WithAllMetadataFromContext(ctx), agent.rpcTimeout)
+	defer cancel()
+	if _, err := client.DeleteDevice(subCtx, device); err != nil {
+		agent.onDeleteFailure(subCtx, err, nil, nil)
+	} else {
+		agent.onDeleteSuccess(subCtx, nil, nil)
+	}
+	return nil
+}
+
+
func (agent *Agent) ReconcileDevice(ctx context.Context) {
+ // Do not reconcile if the device was in DELETE_FAILED transient state. Just invoke the force delete on that device.
+ state := agent.getTransientState()
+ logger.Debugw(ctx, "starting-reconcile", log.Fields{"device-id": agent.deviceID, "state": state})
+ if agent.getTransientState() == core.DeviceTransientState_DELETE_FAILED {
+ if err := agent.DeleteDevicePostAdapterRestart(ctx); err != nil {
+ logger.Errorw(ctx, "delete-post-restart-failed", log.Fields{"error": err, "device-id": agent.deviceID})
+ }
+ return
+ }
+
requestStatus := &common.OperationResp{Code: common.OperationResp_OPERATION_FAILURE}
var desc string