[VOL-5404]-retry introduced for failed child device

Change-Id: I5ce67b1493f832788a23951ecd5815a3f5dc3234
Signed-off-by: aksoni <akash.soni@radisys.com>
diff --git a/VERSION b/VERSION
index 26664c7..bdf6e02 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.6.3-dev3
+3.6.3-dev4
diff --git a/rw_core/core/device/manager_state_callback.go b/rw_core/core/device/manager_state_callback.go
index 7389e45..b64c10a 100644
--- a/rw_core/core/device/manager_state_callback.go
+++ b/rw_core/core/device/manager_state_callback.go
@@ -17,7 +17,6 @@
 
 import (
 	"context"
-	"errors"
 
 	"github.com/opencord/voltha-lib-go/v7/pkg/log"
 	"github.com/opencord/voltha-protos/v5/go/core"
@@ -123,28 +122,52 @@
 	if agent == nil {
 		return status.Errorf(codes.NotFound, "%s", parentCurrDevice.Id)
 	}
+	// The retry mechanism ensures that failed child device deletions are re-attempted.
+	// We continue retrying the deletion process until all child devices are successfully deleted.
+	childDeviceIDs := dMgr.getAllChildDeviceIds(ctx, parentCurrDevice.Id)
+	if len(childDeviceIDs) == 0 {
+		logger.Debugw(ctx, "no-child-devices-to-delete", log.Fields{"parent-device-id": parentCurrDevice.Id})
+		return nil
+	}
 
-	for childDeviceID := range dMgr.getAllChildDeviceIds(ctx, parentCurrDevice.Id) {
-		if agent := dMgr.getDeviceAgent(ctx, childDeviceID); agent != nil {
-			logger.Debugw(ctx, "invoking-delete-device", log.Fields{"device-id": childDeviceID, "parent-device-id": parentCurrDevice.Id})
-			if err := agent.deleteDeviceForce(ctx); err != nil {
-				logger.Warnw(ctx, "delete-device-force-failed", log.Fields{"device-id": childDeviceID, "parent-device-id": parentCurrDevice.Id,
-					"error": err})
-				// We got an error - if its a connection error we should just mark the device as delete failed and
-				// when connection is established then proceed with the deletion instead of reconciling the device.
-				// A DeviceTransientState_DELETE_FAILED does not perform any state transition
-				if errors.Is(err, errNoConnection) {
+	retryNeeded := true
+
+	// Keep retrying until all devices are deleted
+	for retryNeeded {
+		retryNeeded = false
+
+		for childDeviceID := range childDeviceIDs {
+			if agent := dMgr.getDeviceAgent(ctx, childDeviceID); agent != nil {
+				logger.Debugw(ctx, "invoking-delete-device", log.Fields{"device-id": childDeviceID, "parent-device-id": parentCurrDevice.Id})
+
+				// Attempt to delete the device forcefully. If it fails, we mark it for retry.
+				if err := agent.deleteDeviceForce(ctx); err != nil {
+					logger.Warnw(ctx, "delete-device-force-failed", log.Fields{
+						"device-id":        childDeviceID,
+						"parent-device-id": parentCurrDevice.Id,
+						"error":            err,
+					})
+
+					// On failure, mark the device as DELETE_FAILED to trigger a retry.
 					if err = agent.updateTransientState(ctx, core.DeviceTransientState_DELETE_FAILED); err != nil {
-						logger.Warnw(ctx, "failed-updating-transient-state", log.Fields{"device-id": childDeviceID, "parent-device-id": parentCurrDevice.Id,
-							"error": err})
+						logger.Warnw(ctx, "failed-updating-transient-state", log.Fields{
+							"device-id":        childDeviceID,
+							"parent-device-id": parentCurrDevice.Id,
+							"error":            err,
+						})
 					}
-					logger.Debugw(ctx, "device-set-to-delete-failed", log.Fields{"device-id": childDeviceID, "parent-device-id": parentCurrDevice.Id})
+
+					retryNeeded = true
+				} else {
+					// On successful deletion, remove the child device ID from the map.
+					delete(childDeviceIDs, childDeviceID)
+					logger.Debugw(ctx, "child-device-deleted", log.Fields{"device-id": childDeviceID})
 				}
 			}
-			// No further action is required here.  The deleteDevice will change the device state where the resulting
-			// callback will take care of cleaning the child device agent.
 		}
 	}
+
+	logger.Debugw(ctx, "all-child-devices-deleted", log.Fields{"parent-device-id": parentCurrDevice.Id})
 	return nil
 }