# Copyright 2020-present Open Networking Foundation
# SPDX-License-Identifier: Apache-2.0
Andy Bavier0c586ca2021-03-12 14:36:40 -07003
{{- if .Values.alerts.enabled }}
# PrometheusRule with the alerts for the edge end-to-end (E2E) tests.
# The whole resource is rendered only when .Values.alerts.enabled is set.
#
# Messages wrap the label lookup in a backtick-quoted template string so
# that Helm emits the inner braces literally instead of evaluating them;
# the lookup is then left for alert-time templating.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "edge-monitoring-server.fullname" . }}
  labels:
    {{- include "edge-monitoring-server.labels" . | nindent 4 }}
spec:
  groups:
    - name: ace-e2e-tests-v2.rules
      rules:
        # Informational: the maintenance-window gauge has been above 0
        # for one minute.
        - alert: ScheduledDowntime
          annotations:
            message: The cluster {{`{{ .Labels.name }}`}} is undergoing scheduled maintenance.
          expr: aetheredge_in_maintenance_window{endpoint="metrics80"} > 0
          for: 1m
          labels:
            severity: info
        # The last-update gauge is more than 300 s behind time(), and has
        # been for one minute.
        # NOTE(review): presumably the gauge is a Unix timestamp in
        # seconds -- confirm against the exporter.
        - alert: SingleEdgeTestNotReporting
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} has not reported results for at least 5 minutes.
          expr: (time() - aetheredge_last_update{endpoint="metrics80"}) > 300
          for: 1m
          labels:
            severity: critical
        # The connect-test gauge has been below 1 for ten minutes.
        - alert: SingleEdgeConnectTestFailing
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting UE connect failure for at least 10 minutes.
          expr: aetheredge_connect_test_ok{endpoint="metrics80"} < 1
          for: 10m
          labels:
            severity: critical
        # The ping-test gauge has been below 1 for eleven minutes.
        # NOTE(review): `for` is 11m while the message says 10 minutes --
        # confirm the extra minute is intentional.
        - alert: SingleEdgePingTestFailing
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting that UE cannot ping the Internet for at least 10 minutes.
          expr: aetheredge_ping_test_ok{endpoint="metrics80"} < 1
          for: 11m
          labels:
            severity: critical
{{- if .Values.alerts.manyEdgeConnectTestsFailing }}
        # Fleet-wide rule, rendered only when
        # .Values.alerts.manyEdgeConnectTestsFailing is set.
        # Per series, the connect-test and maintenance-window gauges are
        # summed and capped at 1; the alert fires when the average of
        # those capped values stays below 0.5 for five minutes.
        # NOTE(review): this presumably counts a cluster in maintenance
        # as passing -- assumes both gauges are 0/1, confirm.
        - alert: ManyEdgeConnectTestsFailing
          annotations:
            message: |
              Over half of the clusters are reporting UE connect failures.
          expr: avg(clamp_max(aetheredge_connect_test_ok{endpoint="metrics80"} + aetheredge_in_maintenance_window{endpoint="metrics80"}, 1)) < 0.5
          for: 5m
          labels:
            severity: critical
{{- end }}
{{- end }}