Deploy alerts in the Helm chart
Change-Id: Idee92f135fa3cbc8694eed034e4dfbb32fd0db4d
diff --git a/monitoring/edge-monitoring-server/templates/prometheusrule.yaml b/monitoring/edge-monitoring-server/templates/prometheusrule.yaml
new file mode 100644
index 0000000..4d5cc18
--- /dev/null
+++ b/monitoring/edge-monitoring-server/templates/prometheusrule.yaml
@@ -0,0 +1,70 @@
+# Copyright 2020-present Open Networking Foundation
+# SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0
+
+{{- if .Values.alerts.enabled }}
+# Alerting rules for the edge E2E tests, packaged as a Prometheus Operator
+# PrometheusRule. The resource is rendered only when the chart value
+# alerts.enabled is truthy.
+#
+# Annotation templates are wrapped in a backtick raw string so that Helm emits
+# them verbatim and Prometheus, not Helm, expands them when an alert fires.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "edge-monitoring-server.fullname" . }}
+  labels:
+    {{- include "edge-monitoring-server.labels" . | nindent 4 }}
+spec:
+  groups:
+    - name: ace-e2e-tests-v2.rules
+      rules:
+        # Informational: the maintenance-window metric has been above 0 for 1m.
+        - alert: ScheduledDowntime
+          annotations:
+            message: The cluster {{`{{ .Labels.name }}`}} is undergoing scheduled maintenance.
+          expr: aetheredge_in_maintenance_window{endpoint="metrics80"} > 0
+          for: 1m
+          labels:
+            severity: info
+        # The last-update value is more than 600s behind the evaluation time
+        # (presumably a Unix timestamp in seconds; confirm against the exporter).
+        - alert: SingleEdgeTestNotReporting
+          annotations:
+            message: |
+              The E2E test on cluster {{`{{ .Labels.name }}`}} has not reported results for at least 10 minutes.
+          expr: (time() - aetheredge_last_update{endpoint="metrics80"}) > 600
+          for: 1m
+          labels:
+            severity: critical
+        # The connect-test result has stayed below 1 for 10m.
+        - alert: SingleEdgeConnectTestFailing
+          annotations:
+            message: |
+              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting UE connect failure for at least 10 minutes.
+          expr: aetheredge_connect_test_ok{endpoint="metrics80"} < 1
+          for: 10m
+          labels:
+            severity: critical
+        # The ping-test result has stayed below 1 for 10m.
+        - alert: SingleEdgePingTestFailing
+          annotations:
+            message: |
+              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting that UE cannot ping the Internet for at least 10 minutes.
+          expr: aetheredge_ping_test_ok{endpoint="metrics80"} < 1
+          for: 10m
+          labels:
+            severity: critical
+        # More than half of the reporting clusters fail the connect test.
+        # The expression yields the failing share (1 - passing share) so that
+        # $value in the message is the percentage the text describes; the firing
+        # condition is the same as "passing share < 0.5". Assumes the metric is
+        # a 0/1 gauge - confirm against the exporter.
+        - alert: ManyEdgeConnectTestsFailing
+          annotations:
+            message: |
+              {{`{{ $value | humanizePercentage }}`}} of the clusters are reporting UE connect failures.
+          expr: 1 - (sum(aetheredge_connect_test_ok{endpoint="metrics80"}) / count(aetheredge_connect_test_ok{endpoint="metrics80"})) > 0.5
+          for: 1m
+          labels:
+            severity: critical
+{{- end }}