Deploy alerts in the Helm chart
Change-Id: Idee92f135fa3cbc8694eed034e4dfbb32fd0db4d
diff --git a/monitoring/edge-monitoring-server/Chart.yaml b/monitoring/edge-monitoring-server/Chart.yaml
index 8d3e70a..bf5f414 100644
--- a/monitoring/edge-monitoring-server/Chart.yaml
+++ b/monitoring/edge-monitoring-server/Chart.yaml
@@ -8,7 +8,7 @@
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.3.5
+version: 0.4.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
diff --git a/monitoring/edge-monitoring-server/templates/prometheusrule.yaml b/monitoring/edge-monitoring-server/templates/prometheusrule.yaml
new file mode 100644
index 0000000..4d5cc18
--- /dev/null
+++ b/monitoring/edge-monitoring-server/templates/prometheusrule.yaml
@@ -0,0 +1,66 @@
+# Copyright 2020-present Open Networking Foundation
+# SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0
+
+{{- if .Values.alerts.enabled }}
+# Alerting rules for the edge E2E test metrics. The resource below is
+# rendered only when .Values.alerts.enabled is true.
+# Prometheus-side template expressions in the messages are wrapped in Helm
+# raw-string escapes so Helm passes them through instead of evaluating them.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "edge-monitoring-server.fullname" . }}
+  labels:
+    {{- include "edge-monitoring-server.labels" . | nindent 4 }}
+spec:
+  groups:
+    - name: ace-e2e-tests-v2.rules
+      rules:
+        # Informational: fires while the maintenance-window metric is positive.
+        - alert: ScheduledDowntime
+          annotations:
+            message: The cluster {{`{{ .Labels.name }}`}} is undergoing scheduled maintenance.
+          expr: aetheredge_in_maintenance_window{endpoint="metrics80"} > 0
+          for: 1m
+          labels:
+            severity: info
+        # Last update is more than 600 seconds old, sustained for one minute.
+        - alert: SingleEdgeTestNotReporting
+          annotations:
+            message: |
+              The E2E test on cluster {{`{{ .Labels.name }}`}} has not reported results for at least 10 minutes.
+          expr: (time() - aetheredge_last_update{endpoint="metrics80"}) > 600
+          for: 1m
+          labels:
+            severity: critical
+        # Connect-test metric below 1, sustained for ten minutes.
+        - alert: SingleEdgeConnectTestFailing
+          annotations:
+            message: |
+              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting UE connect failure for at least 10 minutes.
+          expr: aetheredge_connect_test_ok{endpoint="metrics80"} < 1
+          for: 10m
+          labels:
+            severity: critical
+        # Ping-test metric below 1, sustained for ten minutes.
+        - alert: SingleEdgePingTestFailing
+          annotations:
+            message: |
+              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting that UE cannot ping the Internet for at least 10 minutes.
+          expr: aetheredge_ping_test_ok{endpoint="metrics80"} < 1
+          for: 10m
+          labels:
+            severity: critical
+        # The expression yields the FAILING share (1 - passing share) so that
+        # $value in the message is the percentage of failing clusters; the alert
+        # still fires exactly when fewer than half of the clusters pass.
+        # NOTE(review): assumes the metric is 1 on success and 0 on failure.
+        - alert: ManyEdgeConnectTestsFailing
+          annotations:
+            message: |
+              {{`{{ $value | humanizePercentage }}`}} of the clusters are reporting UE connect failures.
+          expr: (1 - sum(aetheredge_connect_test_ok{endpoint="metrics80"}) / count(aetheredge_connect_test_ok{endpoint="metrics80"})) > 0.5
+          for: 1m
+          labels:
+            severity: critical
+{{- end }}
diff --git a/monitoring/edge-monitoring-server/values.yaml b/monitoring/edge-monitoring-server/values.yaml
index fa1a468..e013840 100644
--- a/monitoring/edge-monitoring-server/values.yaml
+++ b/monitoring/edge-monitoring-server/values.yaml
@@ -19,6 +19,10 @@
# Name of environment that server is monitoring (e.g., "staging")
aetherEnv: "production"
+# Set enabled to true to deploy the PrometheusRule containing the E2E test alerts
+alerts:
+  enabled: false
+
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""