VOL-4209 - halt retries on reconcile failure

Change-Id: Ib85f08d65a5d19ea91726fda1fe094ef9afadfcd
diff --git a/rw_core/core/device/agent.go b/rw_core/core/device/agent.go
index faf3845..9cd49f0 100755
--- a/rw_core/core/device/agent.go
+++ b/rw_core/core/device/agent.go
@@ -1314,7 +1314,22 @@
 	agent.stopReconciling = make(chan int)
 	agent.stopReconcilingMutex.Unlock()
 
+	// defined outside the retry loop so it can be cleaned
+	// up when the loop breaks
+	var backoffTimer *time.Timer
+
+retry:
 	for {
+		// If the operations state of the device is RECONCILING_FAILED then we do not
+		// want to continue to attempt reconciliation.
+		deviceRef := agent.getDeviceReadOnlyWithoutLock()
+		if deviceRef.OperStatus == common.OperStatus_RECONCILING_FAILED {
+			logger.Warnw(ctx, "reconciling-failed-halting-retries",
+				log.Fields{"device-id": device.Id})
+			agent.requestQueue.RequestComplete()
+			break retry
+		}
+
 		// Use an exponential back off to prevent getting into a tight loop
 		duration := reconcilingBackoff.NextBackOff()
 		//This case should never occur in default case as max elapsed time for backoff is 0(by default) , so it will never return stop
@@ -1328,7 +1343,7 @@
 			duration = reconcilingBackoff.NextBackOff()
 		}
 
-		backoffTimer := time.NewTimer(duration)
+		backoffTimer = time.NewTimer(duration)
 
 		logger.Debugw(ctx, "retrying-reconciling", log.Fields{"deviceID": device.Id})
 		// Send a reconcile request to the adapter.
@@ -1344,7 +1359,7 @@
 			if err := agent.requestQueue.WaitForGreenLight(ctx); err != nil {
 				desc = err.Error()
 				agent.logDeviceUpdate(ctx, "Reconciling", nil, nil, operStatus, &desc)
-				return
+				break retry
 			}
 			continue
 		}
@@ -1358,17 +1373,31 @@
 		} else {
 			operStatus = &common.OperationResp{Code: common.OperationResp_OPERATION_IN_PROGRESS}
 			agent.logDeviceUpdate(ctx, "Reconciling", nil, nil, operStatus, &desc)
-			if !backoffTimer.Stop() {
-				<-backoffTimer.C
-			}
-			return
+			break retry
 		}
 
 		// Take lock back before retrying
 		if err := agent.requestQueue.WaitForGreenLight(ctx); err != nil {
 			desc = err.Error()
 			agent.logDeviceUpdate(ctx, "Reconciling", nil, nil, operStatus, &desc)
-			return
+			break retry
+		}
+	}
+
+	// Retry loop is broken, so stop any timers and drain the channel
+	if backoffTimer != nil && !backoffTimer.Stop() {
+
+		// As per documentation and stack overflow when a timer is stopped its
+		// channel should be drained. The issue is that Stop returns false
+		// either if the timer has already been fired "OR" if the timer can be
+		// stopped before being fired. This means that in some cases the
+		// channel has already be emptied so attempting to read from it means
+		// a blocked thread. To get around this use a select so if the
+		// channel is already empty the default case hits and we are not
+		// blocked.
+		select {
+		case <-backoffTimer.C:
+		default:
 		}
 	}
 }