VOL-5113: OnuAdapter crashes intermittently while processing ONU indication during scale tests
RCA: 1) The ONU discovery is successfully completed.
2) As part of the ONU indication processing, while creating the interface, an OMCI test is performed before we start the MIB upload process.
3) OMCI requests are sent and while we wait for the response we maintain a timeout.
4) Simultaneously, a delete request for this device triggers cleanup
of the device objects. When the OLT is unable to process the OMCI requests, the ONU indication goroutine wakes up after the timeout
and accesses an invalid object, causing a segmentation fault.
Change-Id: Ifcb64d86854ddb4e2d4857897cc9789128410015
diff --git a/VERSION b/VERSION
index 9462efa..22ac5d0 100755
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.11.8
+2.11.9
diff --git a/internal/pkg/core/device_handler.go b/internal/pkg/core/device_handler.go
index 33ba76e..db394cb 100755
--- a/internal/pkg/core/device_handler.go
+++ b/internal/pkg/core/device_handler.go
@@ -231,6 +231,7 @@
isFlowMonitoringRoutineActive []bool // length of slice equal to number of uni ports
disableDeviceRequested bool // this flag identify ONU received disable request or not
oltAvailable bool
+ deviceDeleteCommChan chan bool
}
// newDeviceHandler creates a new device handler
@@ -283,6 +284,7 @@
ImageState: voltha.ImageState_IMAGE_UNKNOWN,
}
dh.upgradeFsmChan = make(chan struct{})
+ dh.deviceDeleteCommChan = make(chan bool, 2)
if dh.device.PmConfigs != nil { // can happen after onu adapter restart
dh.pmConfigs = cloned.PmConfigs
@@ -2046,6 +2048,9 @@
logger.Warnw(ctx, "omci start-verification timed out (continue normal)", log.Fields{"device-id": dh.DeviceID})
case testresult := <-verifyExec:
logger.Infow(ctx, "Omci start verification done", log.Fields{"device-id": dh.DeviceID, "result": testresult})
+ case <-dh.deviceDeleteCommChan:
+ logger.Warnw(ctx, "Deleting device, stopping the omci test activity", log.Fields{"device-id": dh.DeviceID})
+ return nil
}
/* In py code it looks earlier (on activate ..)
diff --git a/internal/pkg/core/openonu.go b/internal/pkg/core/openonu.go
index 28c9a8d..304cb21 100755
--- a/internal/pkg/core/openonu.go
+++ b/internal/pkg/core/openonu.go
@@ -339,6 +339,8 @@
if handler.pOnuMetricsMgr != nil {
handler.pOnuMetricsMgr.SetdeviceDeletionInProgress(true)
}
+
+ handler.deviceDeleteCommChan <- true
if err := handler.resetFsms(ctx, true); err != nil {
errorsList = append(errorsList, err)
}
diff --git a/internal/pkg/pmmgr/onu_metrics_manager.go b/internal/pkg/pmmgr/onu_metrics_manager.go
index 43ab2a0..03f7af7 100755
--- a/internal/pkg/pmmgr/onu_metrics_manager.go
+++ b/internal/pkg/pmmgr/onu_metrics_manager.go
@@ -333,6 +333,8 @@
maxL2PMGetPayLoadSize int
onuOpticalMetricstimer *time.Timer
onuUniStatusMetricstimer *time.Timer
+ opticalMetricsDelCommChan chan bool
+ uniMetricsDelCommChan chan bool
}
// NewOnuMetricsManager returns a new instance of the NewOnuMetricsManager
@@ -416,6 +418,9 @@
metricsManager.onuUniStatusMetricstimer = time.NewTimer(DefaultMetricCollectionFrequency)
metricsManager.onuUniStatusMetricstimer.Stop()
+ metricsManager.opticalMetricsDelCommChan = make(chan bool, 2)
+ metricsManager.uniMetricsDelCommChan = make(chan bool, 2)
+
logger.Info(ctx, "init-OnuMetricsManager completed", log.Fields{"device-id": metricsManager.deviceID})
return &metricsManager
}
@@ -785,6 +790,9 @@
logger.Errorw(ctx, "timeout waiting for omci-get response for optical metrics", log.Fields{"device-id": mm.deviceID})
// The metrics will be empty in this case
break loop
+ case <-mm.opticalMetricsDelCommChan:
+ logger.Warnw(ctx, "Deleting the device, stopping optical metrics collection for the device ", log.Fields{"device-id": mm.deviceID})
+ return nil, err
}
// Populate metric only if it was enabled.
for k := range OpticalPowerGroupMetrics {
@@ -871,6 +879,9 @@
logger.Errorw(ctx, "timeout waiting for omci-get response for uni status", log.Fields{"device-id": mm.deviceID})
// The metrics could be empty in this case
break loop1
+ case <-mm.uniMetricsDelCommChan:
+ logger.Warnw(ctx, "Deleting the device, stopping UniMetrics collection for the device ", log.Fields{"device-id": mm.deviceID})
+ return nil, err
}
// Populate metric only if it was enabled.
for k := range UniStatusGroupMetrics {
@@ -933,6 +944,9 @@
logger.Errorw(ctx, "timeout waiting for omci-get response for uni status", log.Fields{"device-id": mm.deviceID})
// The metrics could be empty in this case
break loop2
+ case <-mm.uniMetricsDelCommChan:
+ logger.Warnw(ctx, "Deleting the device, stopping UniMetrics collection for the device ", log.Fields{"device-id": mm.deviceID})
+ return nil, err
}
// Populate metric only if it was enabled.
@@ -1002,6 +1016,9 @@
logger.Errorw(ctx, "timeout waiting for omci-get response for uni status", log.Fields{"device-id": mm.deviceID})
// The metrics could be empty in this case
break loop3
+ case <-mm.uniMetricsDelCommChan:
+ logger.Warnw(ctx, "Deleting the device, stopping UniMetrics collection for the device ", log.Fields{"device-id": mm.deviceID})
+ return nil, err
}
// Populate metric only if it was enabled.
@@ -1415,6 +1432,10 @@
}
if mm.GetdeviceDeletionInProgress() {
+ mm.onuOpticalMetricstimer.Stop()
+ mm.onuUniStatusMetricstimer.Stop()
+ mm.opticalMetricsDelCommChan <- true
+ mm.uniMetricsDelCommChan <- true
mm.pDeviceHandler = nil
mm.pOnuDeviceEntry = nil
mm.GarbageCollectionComplete <- true