blob: 4ea3e6e6b8ae3ea56b4e2405f2c764d849af1985 [file] [log] [blame]
# Copyright 2020-present Open Networking Foundation
# SPDX-License-Identifier: Apache-2.0
{{- if .Values.alerts.enabled }}
# PrometheusRule holding the alerting rules of the "ace-e2e-tests-v2.rules"
# group. The whole resource is rendered only when .Values.alerts.enabled is
# set.
#
# Alert messages embed their Prometheus template expression inside a Helm
# raw-string action (backtick-quoted), so Helm emits the expression verbatim
# and leaves its evaluation to Prometheus at alert time.
#
# Every expression selects only series labelled endpoint="metrics80".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "edge-monitoring-server.fullname" . }}
  labels:
    {{- include "edge-monitoring-server.labels" . | nindent 4 }}
spec:
  groups:
    - name: ace-e2e-tests-v2.rules
      rules:
        # Informational: the maintenance-window metric has been above zero
        # for one minute.
        - alert: ScheduledDowntime
          annotations:
            message: The cluster {{`{{ .Labels.name }}`}} is undergoing scheduled maintenance.
          expr: aetheredge_in_maintenance_window{endpoint="metrics80"} > 0
          for: 1m
          labels:
            severity: info
        # The last-update value is more than 300 seconds behind the current
        # time, and has stayed that way for one minute.
        - alert: SingleEdgeTestNotReporting
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} has not reported results for at least 5 minutes.
          expr: (time() - aetheredge_last_update{endpoint="metrics80"}) > 300
          for: 1m
          labels:
            severity: critical
        # The connect-test metric has been below 1 for ten minutes.
        - alert: SingleEdgeConnectTestFailing
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting UE connect failure for at least 10 minutes.
          expr: aetheredge_connect_test_ok{endpoint="metrics80"} < 1
          for: 10m
          labels:
            severity: critical
        # The ping-test metric has been below 1 for ten minutes. The hold
        # time matches the "10 minutes" stated in the message and the hold
        # time of the connect-test rule.
        - alert: SingleEdgePingTestFailing
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting that UE cannot ping the Internet for at least 10 minutes.
          expr: aetheredge_ping_test_ok{endpoint="metrics80"} < 1
          for: 10m
          labels:
            severity: critical
        {{- if .Values.alerts.manyEdgeConnectTestsFailing }}
        # Optional fleet-wide rule, rendered only when
        # .Values.alerts.manyEdgeConnectTestsFailing is set.
        # The connect-test and maintenance-window values are summed per
        # series and capped at 1 before averaging; the alert fires when that
        # average stays below 0.5 for five minutes.
        # NOTE(review): presumably both metrics are 0/1 gauges, so a cluster
        # in maintenance counts as passing - confirm against the exporter.
        - alert: ManyEdgeConnectTestsFailing
          annotations:
            message: |
              Over half of the clusters are reporting UE connect failures.
          expr: avg(clamp_max(aetheredge_connect_test_ok{endpoint="metrics80"} + aetheredge_in_maintenance_window{endpoint="metrics80"}, 1)) < 0.5
          for: 5m
          labels:
            severity: critical
        {{- end }}
{{- end }}