VOL-5223: Alarm Mgr has to handle responses for unconfigured MEs from the ONT during Alarm Audit

Change-Id: I9d15232ab63c839d9d6e16ce342f9b8bbe0a3b21
diff --git a/internal/pkg/almgr/alarm_manager.go b/internal/pkg/almgr/alarm_manager.go
index cc6c898..cb9019b 100755
--- a/internal/pkg/almgr/alarm_manager.go
+++ b/internal/pkg/almgr/alarm_manager.go
@@ -33,6 +33,8 @@
 	cmn "github.com/opencord/voltha-openonu-adapter-go/internal/pkg/common"
 	"github.com/opencord/voltha-protos/v5/go/extension"
 	"github.com/opencord/voltha-protos/v5/go/voltha"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 )
 
 const (
@@ -250,13 +252,8 @@
 }
 func (am *OnuAlarmManager) asFsmLeaveAuditing(ctx context.Context, e *fsm.Event) {
 	logger.Debugw(ctx, "alarm-sync-fsm-leave-auditing state", log.Fields{"state": e.FSM.Current(), "device-id": am.deviceID})
-	if am.isAsyncAlarmRequest {
-		logger.Errorw(ctx, "alarm-sync-fsm-leave-auditing state process the updated ONU alarms ", log.Fields{"state": e.FSM.Current(), "device-id": am.deviceID})
-		am.AsyncAlarmsCommChan <- struct{}{}
-		am.isAsyncAlarmRequest = false
-
-	}
 }
+
 func (am *OnuAlarmManager) asFsmResynchronizing(ctx context.Context, e *fsm.Event) {
 	logger.Debugw(ctx, "alarm-sync-fsm", log.Fields{"state": e.FSM.Current(), "device-id": am.deviceID})
 	failureTransition := func() {
@@ -264,80 +261,27 @@
 			logger.Debugw(ctx, "alarm-sync-fsm-cannot-go-to-state-failure", log.Fields{"device-id": am.deviceID, "err": err})
 		}
 	}
-	// See if there is any onu only diff, meaning the class and entity is only in onu DB
-	for alarm := range am.onuDBCopy {
-		if _, exists := am.oltDbCopy[meAlarmKey{
-			classID:    alarm.classID,
-			instanceID: alarm.instanceID,
-		}]; !exists {
-			// We need to raise all such alarms as OLT wont have received notification for these alarms
-			omciAlarmMessage := &omci.AlarmNotificationMsg{
-				MeBasePacket: omci.MeBasePacket{
-					EntityClass:    alarm.classID,
-					EntityInstance: alarm.instanceID,
-				},
-				AlarmBitmap: am.onuDBCopy[alarm],
-			}
-			if err := am.processAlarmData(ctx, omciAlarmMessage); err != nil {
-				logger.Errorw(ctx, "unable-to-process-alarm-notification", log.Fields{"device-id": am.deviceID})
-				// Transition to failure.
-				go failureTransition()
-				return
-			}
-		}
+
+	// Process onu only differences
+	if err := am.processOnuOnlyDifferences(ctx, failureTransition); err != nil {
+		return
 	}
-	// See if there is any olt only diff, meaning the class and entity is only in olt DB
-	for alarm := range am.oltDbCopy {
-		if _, exists := am.onuDBCopy[meAlarmKey{
-			classID:    alarm.classID,
-			instanceID: alarm.instanceID,
-		}]; !exists {
-			// We need to clear all such alarms as OLT might have stale data and the alarms are already cleared.
-			omciAlarmMessage := &omci.AlarmNotificationMsg{
-				MeBasePacket: omci.MeBasePacket{
-					EntityClass:    alarm.classID,
-					EntityInstance: alarm.instanceID,
-				},
-				AlarmBitmap: am.oltDbCopy[alarm],
-			}
-			if err := am.processAlarmData(ctx, omciAlarmMessage); err != nil {
-				logger.Errorw(ctx, "unable-to-process-alarm-notification", log.Fields{"device-id": am.deviceID})
-				// Transition to failure
-				go failureTransition()
-				return
-			}
-		}
+
+	// Process olt only differences
+	if err := am.processOltOnlyDifferences(ctx, failureTransition); err != nil {
+		return
 	}
-	// See if there is any attribute difference
-	for alarm := range am.onuDBCopy {
-		if _, exists := am.oltDbCopy[alarm]; exists {
-			if am.onuDBCopy[alarm] != am.oltDbCopy[alarm] {
-				omciAlarmMessage := &omci.AlarmNotificationMsg{
-					MeBasePacket: omci.MeBasePacket{
-						EntityClass:    alarm.classID,
-						EntityInstance: alarm.instanceID,
-					},
-					AlarmBitmap: am.onuDBCopy[alarm],
-				}
-				// We will assume that onudb is correct always in this case and process the changed bitmap.
-				if err := am.processAlarmData(ctx, omciAlarmMessage); err != nil {
-					logger.Errorw(ctx, "unable-to-process-alarm-notification", log.Fields{"device-id": am.deviceID})
-					// Transition to failure
-					go failureTransition()
-					return
-				}
-			}
-		}
+
+	// Process attribute differences
+	if err := am.processAttributeDifferences(ctx, failureTransition); err != nil {
+		return
 	}
-	// Send the buffered notifications if no failure.
-	for _, notif := range am.bufferedNotifications {
-		logger.Debugw(ctx, "processing-buffered-alarm-notification", log.Fields{"device-id": am.deviceID,
-			"notification": notif})
-		if err := am.processAlarmData(ctx, notif); err != nil {
-			logger.Errorw(ctx, "unable-to-process-alarm-notification", log.Fields{"device-id": am.deviceID})
-			go failureTransition()
-		}
+
+	// Process buffered notifications
+	if err := am.processBufferedNotifications(ctx, failureTransition); err != nil {
+		return
 	}
+
 	go func() {
 		if err := am.AlarmSyncFsm.PFsm.Event(AsEvSuccess); err != nil {
 			logger.Debugw(ctx, "alarm-sync-fsm-cannot-go-to-state-sync", log.Fields{"device-id": am.deviceID, "err": err})
@@ -347,6 +291,14 @@
 
 func (am *OnuAlarmManager) asFsmInSync(ctx context.Context, e *fsm.Event) {
 	logger.Debugw(ctx, "alarm-sync-fsm", log.Fields{"state": e.FSM.Current(), "device-id": am.deviceID})
+
+	if am.isAsyncAlarmRequest {
+		logger.Debugw(ctx, "alarm-sync-fsm-before entering the sync state process the updated ONU alarms ", log.Fields{"state": e.FSM.Current(), "device-id": am.deviceID})
+		am.AsyncAlarmsCommChan <- struct{}{}
+		am.isAsyncAlarmRequest = false
+
+	}
+
 	if am.pDeviceHandler.GetAlarmAuditInterval() > 0 {
 		select {
 		case <-time.After(am.pDeviceHandler.GetAlarmAuditInterval()):
@@ -370,6 +322,16 @@
 			}()
 
 		}
+	} else {
+		<-am.AsyncAlarmsCommChan
+		go func() {
+			logger.Debugw(ctx, "On demand Auditing the ONU for Alarms  ", log.Fields{"device-id": am.deviceID})
+			if err := am.AlarmSyncFsm.PFsm.Event(AsEvAudit); err != nil {
+				logger.Errorw(ctx, "alarm-sync-fsm-cannot-go-to-state-auditing, use current snapshot of alarms", log.Fields{"device-id": am.deviceID, "err": err})
+				am.isAsyncAlarmRequest = false
+				am.AsyncAlarmsCommChan <- struct{}{}
+			}
+		}()
 	}
 }
 
@@ -632,12 +594,12 @@
 	if !am.processMessage {
 		logger.Warnw(ctx, "ignoring-alarm-notification-received-for-me-as-channel-for-processing-is-closed",
 			log.Fields{"device-id": am.deviceID})
-		return fmt.Errorf("alarm-manager-is-in-stopped-state")
+		return status.Error(codes.Unavailable, "alarm-manager-is-in-stopped-state")
 	}
 	if _, present := am.pOnuDeviceEntry.GetOnuDB().MeDb[classID][meInstance]; !present {
 		logger.Errorw(ctx, "me-class-instance-not-present",
 			log.Fields{"class-id": classID, "instance-id": meInstance, "device-id": am.deviceID})
-		return fmt.Errorf("me-class-%d-instance-%d-not-present", classID, meInstance)
+		return status.Error(codes.NotFound, "me-class-instance-not-present")
 	}
 	if sequenceNo > 0 {
 		if am.AlarmSyncFsm.PFsm.Is(asStAuditing) || am.AlarmSyncFsm.PFsm.Is(asStResynchronizing) {
@@ -661,12 +623,12 @@
 	if omciErr.StatusCode() != me.Success {
 		//log error and return
 		logger.Error(ctx, "unable-to-get-managed-entity", log.Fields{"class-id": classID, "instance-id": meInstance})
-		return fmt.Errorf("unable-to-get-managed-entity-class-%d-instance-%d", classID, meInstance)
+		return status.Error(codes.NotFound, "unable-to-get-managed-entity")
 	}
 	meAlarmMap := entity.GetAlarmMap()
 	if meAlarmMap == nil {
 		logger.Error(ctx, "unable-to-get-managed-entity-alarm-map", log.Fields{"class-id": classID, "instance-id": meInstance})
-		return fmt.Errorf("unable-to-get-managed-entity-alarm-map-%d-instance-%d", classID, meInstance)
+		return status.Error(codes.NotFound, "unable-to-get-managed-entity-alarm-map")
 	}
 
 	am.alarmBitMapDB[meAlarmKey{
@@ -910,3 +872,84 @@
 	am.pDeviceHandler = nil
 	am.pOnuDeviceEntry = nil
 }
+
+// processOnuOnlyDifferences runs processAlarm for each onuDBCopy entry whose key is absent from oltDbCopy; it returns the first error, or nil.
+func (am *OnuAlarmManager) processOnuOnlyDifferences(ctx context.Context, failureTransition func()) error {
+	for onuKey, onuBitmap := range am.onuDBCopy {
+		if _, inOlt := am.oltDbCopy[meAlarmKey{classID: onuKey.classID, instanceID: onuKey.instanceID}]; !inOlt {
+			if err := am.processAlarm(ctx, createOmciAlarmMessage(onuKey, onuBitmap), failureTransition); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// processOltOnlyDifferences runs processAlarm for each oltDbCopy entry whose key is absent from onuDBCopy; it returns the first error, or nil.
+func (am *OnuAlarmManager) processOltOnlyDifferences(ctx context.Context, failureTransition func()) error {
+	for oltKey, oltBitmap := range am.oltDbCopy {
+		if _, inOnu := am.onuDBCopy[meAlarmKey{classID: oltKey.classID, instanceID: oltKey.instanceID}]; !inOnu {
+			if err := am.processAlarm(ctx, createOmciAlarmMessage(oltKey, oltBitmap), failureTransition); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// processAttributeDifferences runs processAlarm with the onuDBCopy bitmap for each key held by both copies with differing bitmaps; first error or nil.
+func (am *OnuAlarmManager) processAttributeDifferences(ctx context.Context, failureTransition func()) error {
+	for key, onuBitmap := range am.onuDBCopy {
+		if oltBitmap, shared := am.oltDbCopy[key]; shared && onuBitmap != oltBitmap {
+			if err := am.processAlarm(ctx, createOmciAlarmMessage(key, onuBitmap), failureTransition); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// processBufferedNotifications passes each entry of bufferedNotifications, in slice order, to processAlarm and stops at the first error it returns.
+func (am *OnuAlarmManager) processBufferedNotifications(ctx context.Context, failureTransition func()) error {
+	var err error
+	for i, pending := 0, am.bufferedNotifications; err == nil && i < len(pending); i++ {
+		logger.Debugw(ctx, "processing-buffered-alarm-notification", log.Fields{"device-id": am.deviceID, "notification": pending[i]})
+		err = am.processAlarm(ctx, pending[i], failureTransition)
+	}
+	return err
+}
+
+// processAlarm hands omciAlarmMessage to processAlarmData and maps a failure by its gRPC status code: NotFound is
+// logged and skipped (nil), Unavailable is returned as is, any other error is returned and starts failureTransition.
+func (am *OnuAlarmManager) processAlarm(ctx context.Context, omciAlarmMessage *omci.AlarmNotificationMsg, failureTransition func()) error {
+	// [https://jira.opencord.org/browse/VOL-5223]
+	// The following test scenarios cause AlarmMgr to get into a loop state:
+	// - Unconfigured MEs being reported by vendor ONTs.
+	// - Undefined Alarm Bit Map (ONU-G ME for example).
+	// - MEs created by OLT as per the G.984.4 standard are not part of ONU DB.
+	if err := am.processAlarmData(ctx, omciAlarmMessage); err != nil {
+		// status.Code yields codes.Unknown for an error without a gRPC status, so it reaches default instead of being dropped.
+		switch status.Code(err) {
+		case codes.NotFound:
+			logger.Warnw(ctx, "ME Instance or ME Alarm Map not found in ONUDB", log.Fields{"device-id": am.deviceID, "Error": err})
+		case codes.Unavailable:
+			logger.Warnw(ctx, "Alarm Mgr is stopped, stop further processing", log.Fields{"device-id": am.deviceID, "Error": err})
+			return err
+		default:
+			logger.Errorw(ctx, "Unexpected error", log.Fields{"device-id": am.deviceID, "Error": err})
+			go failureTransition()
+			return err
+		}
+	}
+	return nil
+}
+
+// createOmciAlarmMessage builds an AlarmNotificationMsg addressed to the ME class and instance held in
+// alarm and carrying alarmBitmap as its alarm bitmap; all remaining fields keep their zero value.
+func createOmciAlarmMessage(alarm meAlarmKey, alarmBitmap [alarmBitMapSizeBytes]byte) *omci.AlarmNotificationMsg {
+	// MeBasePacket is named explicitly so the assignments do not depend on field promotion.
+	notification := &omci.AlarmNotificationMsg{AlarmBitmap: alarmBitmap}
+	notification.MeBasePacket.EntityClass = alarm.classID
+	notification.MeBasePacket.EntityInstance = alarm.instanceID
+	return notification
+}