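Tweak alert timers so that alert suppression works better

* SingleEdgeTestNotReporting: lower the staleness threshold on
  aetheredge_last_update from 600s to 300s (the "for: 1m" is unchanged)
  and update the alert message to say 5 minutes.
* UE ping test failure alert (aetheredge_ping_test_ok < 1): raise "for"
  from 10m to 11m.
* "Over half of the clusters are reporting UE connect failures" alert:
  lower "for" from 10m to 5m.
* Bump the edge-monitoring-server chart version from 0.4.3 to 0.4.4.

Change-Id: I3f6146dbf7ec8688ffa8fc4aa9b6348bc9e7a22b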
diff --git a/monitoring/edge-monitoring-server/Chart.yaml b/monitoring/edge-monitoring-server/Chart.yaml
index 388d8ae..f00639f 100644
--- a/monitoring/edge-monitoring-server/Chart.yaml
+++ b/monitoring/edge-monitoring-server/Chart.yaml
@@ -8,7 +8,7 @@
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.4.3
+version: 0.4.4
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
diff --git a/monitoring/edge-monitoring-server/templates/prometheusrule.yaml b/monitoring/edge-monitoring-server/templates/prometheusrule.yaml
index d554431..16d5f52 100644
--- a/monitoring/edge-monitoring-server/templates/prometheusrule.yaml
+++ b/monitoring/edge-monitoring-server/templates/prometheusrule.yaml
@@ -22,8 +22,8 @@
- alert: SingleEdgeTestNotReporting
annotations:
message: |
- The E2E test on cluster {{`{{ .Labels.name }}`}} has not reported results for at least 10 minutes.
- expr: (time() - aetheredge_last_update{endpoint="metrics80"}) > 600
+ The E2E test on cluster {{`{{ .Labels.name }}`}} has not reported results for at least 5 minutes.
+ expr: (time() - aetheredge_last_update{endpoint="metrics80"}) > 300
for: 1m
labels:
severity: critical
@@ -40,7 +40,7 @@
message: |
The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting that UE cannot ping the Internet for at least 10 minutes.
expr: aetheredge_ping_test_ok{endpoint="metrics80"} < 1
- for: 10m
+ for: 11m
labels:
severity: critical
{{- if .Values.alerts.manyEdgeConnectTestsFailing }}
@@ -49,7 +49,7 @@
message: |
Over half of the clusters are reporting UE connect failures.
expr: avg(clamp_max(aetheredge_connect_test_ok{endpoint="metrics80"} + aetheredge_in_maintenance_window{endpoint="metrics80"}, 1)) < 0.5
- for: 10m
+ for: 5m
labels:
severity: critical
{{- end }}