[VOL-4469] Onu adapter reconcilement may stuck on VLAN processing, especially in TT traffic scenarios
Signed-off-by: mpagenko <michael.pagenkopf@adtran.com>
Change-Id: If321388c67e2e52eb04b8a55167eb3c1c7575e5d
diff --git a/internal/pkg/core/device_handler.go b/internal/pkg/core/device_handler.go
index f1005e6..0203575 100755
--- a/internal/pkg/core/device_handler.go
+++ b/internal/pkg/core/device_handler.go
@@ -55,8 +55,15 @@
"github.com/opencord/voltha-protos/v5/go/voltha"
)
-// Constants for timeouts
const (
+ //constants for reconcile flow check channel
+ cWaitReconcileFlowAbortOnSuccess = 0xFFFD
+ cWaitReconcileFlowAbortOnError = 0xFFFE
+ cWaitReconcileFlowNoActivity = 0xFFFF
+)
+
+const (
+ // constants for timeouts
cTimeOutRemoveUpgrade = 1 //for usage in seconds
)
@@ -180,30 +187,33 @@
//discOnus sync.Map
//onus sync.Map
//portStats *OpenOltStatisticsMgr
- collectorIsRunning bool
- mutexCollectorFlag sync.RWMutex
- stopCollector chan bool
- alarmManagerIsRunning bool
- mutextAlarmManagerFlag sync.RWMutex
- stopAlarmManager chan bool
- stopHeartbeatCheck chan bool
- uniEntityMap cmn.OnuUniPortMap
- mutexKvStoreContext sync.Mutex
- lockVlanConfig sync.RWMutex
- lockVlanAdd sync.RWMutex
- UniVlanConfigFsmMap map[uint8]*avcfg.UniVlanConfigFsm
- lockUpgradeFsm sync.RWMutex
- pOnuUpradeFsm *swupg.OnuUpgradeFsm
- upgradeCanceled bool
- reconciling uint8
- mutexReconcilingFlag sync.RWMutex
- chReconcilingFinished chan bool //channel to indicate that reconciling has been finished
- mutexReadyForOmciConfig sync.RWMutex
- readyForOmciConfig bool
- deletionInProgress bool
- mutexDeletionInProgressFlag sync.RWMutex
- pLastUpgradeImageState *voltha.ImageState
- upgradeFsmChan chan struct{}
+ collectorIsRunning bool
+ mutexCollectorFlag sync.RWMutex
+ stopCollector chan bool
+ alarmManagerIsRunning bool
+ mutextAlarmManagerFlag sync.RWMutex
+ stopAlarmManager chan bool
+ stopHeartbeatCheck chan bool
+ uniEntityMap cmn.OnuUniPortMap
+ mutexKvStoreContext sync.Mutex
+ lockVlanConfig sync.RWMutex
+ lockVlanAdd sync.RWMutex
+ UniVlanConfigFsmMap map[uint8]*avcfg.UniVlanConfigFsm
+ lockUpgradeFsm sync.RWMutex
+ pOnuUpradeFsm *swupg.OnuUpgradeFsm
+ upgradeCanceled bool
+ reconciling uint8
+ mutexReconcilingFlag sync.RWMutex
+ chUniVlanConfigReconcilingDone chan uint16 //channel to indicate that VlanConfig reconciling for a specific UNI has been finished
+ chReconcilingFinished chan bool //channel to indicate that reconciling has been finished
+ reconcileExpiryComplete time.Duration
+ reconcileExpiryVlanConfig time.Duration
+ mutexReadyForOmciConfig sync.RWMutex
+ readyForOmciConfig bool
+ deletionInProgress bool
+ mutexDeletionInProgressFlag sync.RWMutex
+ pLastUpgradeImageState *voltha.ImageState
+ upgradeFsmChan chan struct{}
flowCbChan []chan FlowCb
mutexFlowMonitoringRoutineFlag sync.RWMutex
@@ -239,7 +249,16 @@
dh.lockUpgradeFsm = sync.RWMutex{}
dh.UniVlanConfigFsmMap = make(map[uint8]*avcfg.UniVlanConfigFsm)
dh.reconciling = cNoReconciling
+ dh.chUniVlanConfigReconcilingDone = make(chan uint16)
dh.chReconcilingFinished = make(chan bool)
+ dh.reconcileExpiryComplete = adapter.maxTimeoutReconciling //assumption is to have it as duration in s!
+ rECSeconds := int(dh.reconcileExpiryComplete / time.Second)
+ if rECSeconds < 2 {
+ dh.reconcileExpiryComplete = time.Duration(2) * time.Second //ensure a minimum expiry time of 2s for complete reconciling
+ rECSeconds = 2
+ }
+ rEVCSeconds := rECSeconds / 2
+ dh.reconcileExpiryVlanConfig = time.Duration(rEVCSeconds) * time.Second //set this duration to some according lower value
dh.readyForOmciConfig = false
dh.deletionInProgress = false
dh.pLastUpgradeImageState = &voltha.ImageState{
@@ -812,7 +831,7 @@
} else {
logger.Errorw(ctx, "reconciling - restoring OnuTp-data failed - abort", log.Fields{"err": err, "device-id": dh.DeviceID})
}
- dh.StopReconciling(ctx, false)
+ dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
return
}
var onuIndication oop.OnuIndication
@@ -832,7 +851,7 @@
if pDevEntry == nil {
logger.Errorw(ctx, "No valid OnuDevice - aborting", log.Fields{"device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, false)
+ dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
}
return
}
@@ -846,7 +865,7 @@
logger.Debugw(ctx, "reconciling - no uni-configs have been stored before adapter restart - terminate reconcilement",
log.Fields{"device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, true)
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowNoActivity)
}
return
}
@@ -934,13 +953,13 @@
logger.Debugw(ctx, "reconciling - no TPs have been stored before adapter restart - terminate reconcilement",
log.Fields{"device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, true)
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowNoActivity)
}
return
}
if abTechProfInstLoadFailed {
dh.SetDeviceReason(cmn.DrTechProfileConfigDownloadFailed)
- dh.StopReconciling(ctx, false)
+ dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
return
} else if dh.IsSkipOnuConfigReconciling() {
dh.SetDeviceReason(cmn.DrTechProfileConfigDownloadSuccess)
@@ -949,7 +968,7 @@
logger.Debugw(ctx, "reconciling - no flows have been stored before adapter restart - terminate reconcilement",
log.Fields{"device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, true)
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowNoActivity)
}
}
}
@@ -961,8 +980,13 @@
if pDevEntry == nil {
logger.Errorw(ctx, "No valid OnuDevice - aborting", log.Fields{"device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, false)
+ dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
}
+ //else we don't stop the device handler reconciling in constellation with omci configuration
+ // to avoid unintented state update to rwCore due to still running background processes
+ // such is e.g. possible in TT scenarios with multiple techProfiles as currently the end of processing
+ // of all techProfiles is not awaited (ready on first TP done event)
+ // (applicable to all according code points below)
return
}
@@ -972,11 +996,14 @@
logger.Debugw(ctx, "reconciling - no uni-configs have been stored before adapter restart - terminate reconcilement",
log.Fields{"device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, true)
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowNoActivity)
}
return
}
flowsFound := false
+ var uniVlanConfigEntries []uint8
+ var loWaitGroupWTO cmn.WaitGroupWithTimeOut
+
for _, uniData := range pDevEntry.SOnuPersistentData.PersUniConfig {
//TODO: check for uni-port specific reconcilement in case of multi-uni-port-per-onu-support
if len(uniData.PersFlowParams) == 0 {
@@ -985,7 +1012,7 @@
continue
}
if len(uniData.PersTpPathMap) == 0 {
- logger.Warnw(ctx, "reconciling - flows but no TPs stored for uniID",
+ logger.Warnw(ctx, "reconciling flows - but no TPs stored for uniID, abort",
log.Fields{"uni-id": uniData.PersUniID, "device-id": dh.DeviceID})
// It doesn't make sense to configure any flows if no TPs are available
continue
@@ -1002,46 +1029,20 @@
logger.Errorw(ctx, "reconciling - OnuUniPort data not found - terminate reconcilement",
log.Fields{"uniNo": uniNo, "device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, false)
+ dh.stopReconciling(ctx, false, cWaitReconcileFlowNoActivity)
}
return
}
- flowsFound = true
- lastFlowToReconcile := false
- flowsProcessed := 0
- pDevEntry.SetReconcilingFlows(true)
- for _, flowData := range uniData.PersFlowParams {
- logger.Debugw(ctx, "reconciling - add flow with cookie slice", log.Fields{
- "device-id": dh.DeviceID, "uni-id": uniData.PersUniID, "cookies": flowData.CookieSlice})
- if flowsProcessed == len(uniData.PersFlowParams)-1 {
- lastFlowToReconcile = true
- }
- //the slice can be passed 'by value' here, - which internally passes its reference copy
- dh.lockVlanConfig.Lock()
- if _, exist = dh.UniVlanConfigFsmMap[uniData.PersUniID]; exist {
- if err := dh.UniVlanConfigFsmMap[uniData.PersUniID].SetUniFlowParams(ctx, flowData.VlanRuleParams.TpID,
- flowData.CookieSlice, uint16(flowData.VlanRuleParams.MatchVid), uint16(flowData.VlanRuleParams.SetVid),
- uint8(flowData.VlanRuleParams.SetPcp), lastFlowToReconcile, flowData.Meter, nil); err != nil {
- logger.Errorw(ctx, err.Error(), log.Fields{"device-id": dh.DeviceID})
- }
- } else {
- if err := dh.createVlanFilterFsm(ctx, uniPort, flowData.VlanRuleParams.TpID, flowData.CookieSlice,
- uint16(flowData.VlanRuleParams.MatchVid), uint16(flowData.VlanRuleParams.SetVid),
- uint8(flowData.VlanRuleParams.SetPcp), cmn.OmciVlanFilterAddDone, lastFlowToReconcile, flowData.Meter, nil); err != nil {
- logger.Errorw(ctx, err.Error(), log.Fields{"device-id": dh.DeviceID})
- }
- }
- dh.lockVlanConfig.Unlock()
- flowsProcessed++
- } //for all flows of this UNI
+ //needed to split up function due to sca complexity
+ dh.updateReconcileFlowConfig(ctx, uniPort, uniData.PersFlowParams, uniVlanConfigEntries, &loWaitGroupWTO, &flowsFound)
+
logger.Debugw(ctx, "reconciling - flows processed", log.Fields{
- "device-id": dh.DeviceID, "uni-id": uniData.PersUniID, "flowsProcessed": flowsProcessed,
+ "device-id": dh.DeviceID, "uni-id": uniData.PersUniID,
"NumUniFlows": dh.UniVlanConfigFsmMap[uniData.PersUniID].NumUniFlows,
"ConfiguredUniFlow": dh.UniVlanConfigFsmMap[uniData.PersUniID].ConfiguredUniFlow})
// this can't be used as global finished reconciling flag because
// assumes is getting called before the state machines for the last flow is completed,
// while this is not guaranteed.
- //dh.SetReconcilingFlows(false)
pDevEntry.MutexPersOnuConfig.RLock() //set protection again for loop test on SOnuPersistentData
} // for all UNI entries from SOnuPersistentData
pDevEntry.MutexPersOnuConfig.RUnlock()
@@ -1050,18 +1051,176 @@
logger.Debugw(ctx, "reconciling - no flows have been stored before adapter restart - terminate reconcilement",
log.Fields{"device-id": dh.DeviceID})
if !dh.IsSkipOnuConfigReconciling() {
- dh.StopReconciling(ctx, true)
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowNoActivity)
}
return
}
+
if dh.IsSkipOnuConfigReconciling() {
+ //only with 'SkipOnuConfig' we need to wait for all finished-signals
+ // from vlanConfig processing of all UNI's.
+ logger.Debugw(ctx, "reconciling flows - waiting on ready indication of requested UNIs", log.Fields{
+ "device-id": dh.DeviceID, "expiry": dh.reconcileExpiryVlanConfig})
+ if executed := loWaitGroupWTO.WaitTimeout(dh.reconcileExpiryVlanConfig); executed {
+ logger.Debugw(ctx, "reconciling flows for all UNI's has been finished in time",
+ log.Fields{"device-id": dh.DeviceID})
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowAbortOnSuccess)
+ if pDevEntry != nil {
+ pDevEntry.SendChReconcilingFlowsFinished(true)
+ }
+ } else {
+ logger.Errorw(ctx, "timeout waiting for reconciling flows for all UNI's to be finished!",
+ log.Fields{"device-id": dh.DeviceID})
+ dh.stopReconciling(ctx, false, cWaitReconcileFlowAbortOnError)
+ if pDevEntry != nil {
+ pDevEntry.SendChReconcilingFlowsFinished(false)
+ }
+ return
+ }
dh.SetDeviceReason(cmn.DrOmciFlowsPushed)
}
}
+func (dh *deviceHandler) updateReconcileFlowConfig(ctx context.Context, apUniPort *cmn.OnuUniPort,
+ aPersFlowParam []cmn.UniVlanFlowParams, aUniVlanConfigEntries []uint8,
+ apWaitGroup *cmn.WaitGroupWithTimeOut, apFlowsFound *bool) {
+ flowsProcessed := 0
+ lastFlowToReconcile := false
+ loUniID := apUniPort.UniID
+ for _, flowData := range aPersFlowParam {
+ if dh.IsSkipOnuConfigReconciling() {
+ if !(*apFlowsFound) {
+ *apFlowsFound = true
+ syncChannel := make(chan struct{})
+ // start go routine with select() on reconciling vlan config channel before
+ // starting vlan config reconciling process to prevent loss of any signal
+ // this routine just collects all the received 'flow-reconciled' signals - possibly from different UNI's
+ go dh.waitOnUniVlanConfigReconcilingReady(ctx, syncChannel, apWaitGroup)
+ //block until the wait routine is really blocked on channel input
+ // in order to prevent to early ready signal from VlanConfig processing
+ <-syncChannel
+ }
+ if flowsProcessed == len(aPersFlowParam)-1 {
+ var uniAdded bool
+ lastFlowToReconcile = true
+ if aUniVlanConfigEntries, uniAdded = dh.appendIfMissing(aUniVlanConfigEntries, loUniID); uniAdded {
+ apWaitGroup.Add(1) //increment the waiting group
+ }
+ }
+ }
+ // note for above block: also lastFlowToReconcile (as parameter to flow config below)
+ // is only relevant in the vlanConfig processing for IsSkipOnuConfigReconciling = true
+ logger.Debugw(ctx, "reconciling - add flow with cookie slice", log.Fields{
+ "device-id": dh.DeviceID, "uni-id": loUniID,
+ "flowsProcessed": flowsProcessed, "cookies": flowData.CookieSlice})
+ dh.lockVlanConfig.Lock()
+ //the CookieSlice can be passed 'by value' here, - which internally passes its reference
+ if _, exist := dh.UniVlanConfigFsmMap[loUniID]; exist {
+ if err := dh.UniVlanConfigFsmMap[loUniID].SetUniFlowParams(ctx, flowData.VlanRuleParams.TpID,
+ flowData.CookieSlice, uint16(flowData.VlanRuleParams.MatchVid), uint16(flowData.VlanRuleParams.SetVid),
+ uint8(flowData.VlanRuleParams.SetPcp), lastFlowToReconcile, flowData.Meter, nil); err != nil {
+ logger.Errorw(ctx, err.Error(), log.Fields{"device-id": dh.DeviceID})
+ }
+ } else {
+ if err := dh.createVlanFilterFsm(ctx, apUniPort, flowData.VlanRuleParams.TpID, flowData.CookieSlice,
+ uint16(flowData.VlanRuleParams.MatchVid), uint16(flowData.VlanRuleParams.SetVid),
+ uint8(flowData.VlanRuleParams.SetPcp), cmn.OmciVlanFilterAddDone, lastFlowToReconcile, flowData.Meter, nil); err != nil {
+ logger.Errorw(ctx, err.Error(), log.Fields{"device-id": dh.DeviceID})
+ }
+ }
+ dh.lockVlanConfig.Unlock()
+ flowsProcessed++
+ } //for all flows of this UNI
+}
+
+//waitOnUniVlanConfigReconcilingReady collects all VlanConfigReady signals from VlanConfig FSM processing in reconciling
+// and decrements the according handler wait group waiting for these indications
+func (dh *deviceHandler) waitOnUniVlanConfigReconcilingReady(ctx context.Context, aSyncChannel chan<- struct{},
+ waitGroup *cmn.WaitGroupWithTimeOut) {
+ var reconciledUniVlanConfigEntries []uint8
+ var appended bool
+ expiry := dh.GetReconcileExpiryVlanConfigAbort()
+ logger.Debugw(ctx, "start waiting on reconcile vlanConfig ready indications", log.Fields{
+ "device-id": dh.DeviceID, "expiry": expiry})
+ // indicate blocking on channel now to the caller
+ aSyncChannel <- struct{}{}
+ for {
+ select {
+ case uniIndication := <-dh.chUniVlanConfigReconcilingDone:
+ switch uniIndication {
+ // no activity requested (should normally not be received) - just continue waiting
+ case cWaitReconcileFlowNoActivity:
+ // waiting on channel inputs from VlanConfig for all UNI's to be aborted on error condition
+ case cWaitReconcileFlowAbortOnError:
+ logger.Debugw(ctx, "waitReconcileFlow aborted on error",
+ log.Fields{"device-id": dh.DeviceID, "rxEntries": reconciledUniVlanConfigEntries})
+ return
+ // waiting on channel inputs from VlanConfig for all UNI's to be aborted on success condition
+ case cWaitReconcileFlowAbortOnSuccess:
+ logger.Debugw(ctx, "waitReconcileFlow aborted on success",
+ log.Fields{"device-id": dh.DeviceID, "rxEntries": reconciledUniVlanConfigEntries})
+ return
+ // this should be a valid UNI vlan config done indication
+ default:
+ if uniIndication < platform.MaxUnisPerOnu {
+ logger.Debugw(ctx, "reconciling flows has been finished in time for this UNI",
+ log.Fields{"device-id": dh.DeviceID, "uni-id": uniIndication})
+ if reconciledUniVlanConfigEntries, appended =
+ dh.appendIfMissing(reconciledUniVlanConfigEntries, uint8(uniIndication)); appended {
+ waitGroup.Done()
+ }
+ } else {
+ logger.Errorw(ctx, "received unexpected UNI flowConfig done indication - is ignored",
+ log.Fields{"device-id": dh.DeviceID, "uni-id": uniIndication})
+ }
+ } //switch uniIndication
+
+ case <-time.After(expiry): //a bit longer than reconcileExpiryVlanConfig
+ logger.Errorw(ctx, "timeout waiting for reconciling all UNI flows to be finished!",
+ log.Fields{"device-id": dh.DeviceID})
+ return
+ }
+ }
+}
+
+func (dh *deviceHandler) GetReconcileExpiryVlanConfigAbort() time.Duration {
+ return dh.reconcileExpiryVlanConfig + (500 * time.Millisecond)
+}
+
+func (dh *deviceHandler) appendIfMissing(slice []uint8, val uint8) ([]uint8, bool) {
+ for _, ele := range slice {
+ if ele == val {
+ return slice, false
+ }
+ }
+ return append(slice, val), true
+}
+
+// sendChReconcileFinished - sends true or false on reconcileFinish channel
+func (dh *deviceHandler) sendChReconcileFinished(success bool) {
+ if dh != nil { //if the object still exists (might have been already deleted in background)
+ //use asynchronous channel sending to avoid stucking on non-waiting receiver
+ select {
+ case dh.chReconcilingFinished <- success:
+ default:
+ }
+ }
+}
+
+// SendChUniVlanConfigFinished - sends the Uni number on channel if the flow reconcilement for this UNI is finished
+func (dh *deviceHandler) SendChUniVlanConfigFinished(value uint16) {
+ if dh != nil { //if the object still exists (might have been already deleted in background)
+ //use asynchronous channel sending to avoid stucking on non-waiting receiver
+ select {
+ case dh.chUniVlanConfigReconcilingDone <- value:
+ default:
+ }
+ }
+}
+
func (dh *deviceHandler) reconcileEnd(ctx context.Context) {
logger.Debugw(ctx, "reconciling - completed!", log.Fields{"device-id": dh.DeviceID})
- dh.StopReconciling(ctx, true)
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowNoActivity)
}
func (dh *deviceHandler) deleteDevicePersistencyData(ctx context.Context) error {
@@ -1821,7 +1980,7 @@
pDevEntry.MutexPersOnuConfig.RUnlock()
logger.Debugw(ctx, "reconciling - uni-ports were not unlocked before adapter restart - resume with a normal start-up",
log.Fields{"device-id": dh.DeviceID})
- dh.StopReconciling(ctx, true)
+ dh.stopReconciling(ctx, true, cWaitReconcileFlowNoActivity)
} else {
pDevEntry.MutexPersOnuConfig.RUnlock()
}
@@ -2068,6 +2227,8 @@
if pMibDlFsm != nil {
_ = pMibDlFsm.Event(mib.DlEvReset)
}
+ //stop any deviceHandler reconcile processing (if running)
+ dh.stopReconciling(ctx, false, cWaitReconcileFlowAbortOnError)
//port lock/unlock FSM's may be active
if dh.pUnlockStateFsm != nil {
_ = dh.pUnlockStateFsm.PAdaptFsm.PFsm.Event(uniprt.UniEvReset)
@@ -3819,7 +3980,7 @@
if !dh.IsReconciling() {
go func() {
logger.Debugw(ctx, "wait for channel signal or timeout",
- log.Fields{"timeout": dh.pOpenOnuAc.maxTimeoutReconciling, "device-id": dh.DeviceID})
+ log.Fields{"timeout": dh.reconcileExpiryComplete, "device-id": dh.DeviceID})
select {
case success := <-dh.chReconcilingFinished:
if success {
@@ -3867,7 +4028,7 @@
dh.deviceReconcileFailedUpdate(ctx, cmn.DrReconcileCanceled, connectStatus)
}
- case <-time.After(dh.pOpenOnuAc.maxTimeoutReconciling):
+ case <-time.After(dh.reconcileExpiryComplete):
logger.Errorw(ctx, "timeout waiting for reconciling to be finished!",
log.Fields{"device-id": dh.DeviceID})
@@ -3895,12 +4056,15 @@
dh.mutexReconcilingFlag.Unlock()
}
-func (dh *deviceHandler) StopReconciling(ctx context.Context, success bool) {
+func (dh *deviceHandler) stopReconciling(ctx context.Context, success bool, reconcileFlowResult uint16) {
logger.Debugw(ctx, "stop reconciling", log.Fields{"device-id": dh.DeviceID, "success": success})
if dh.IsReconciling() {
- dh.chReconcilingFinished <- success
+ dh.sendChReconcileFinished(success)
+ if reconcileFlowResult != cWaitReconcileFlowNoActivity {
+ dh.SendChUniVlanConfigFinished(reconcileFlowResult)
+ }
} else {
- logger.Infow(ctx, "reconciling is not running", log.Fields{"device-id": dh.DeviceID})
+ logger.Debugw(ctx, "nothing to stop - reconciling is not running", log.Fields{"device-id": dh.DeviceID})
}
}