# blob: fc29f2a066d56d2194dcbfd02d975245a26cb1e9
# Copyright 2020-present Open Networking Foundation
# SPDX-License-Identifier: LicenseRef-ONF-Member-Only-1.0

{{- if .Values.alerts.enabled }}
# PrometheusRule holding the edge E2E-test alerts. The whole manifest is
# rendered only when .Values.alerts.enabled is truthy.
#
# Alert-time placeholders in the messages are wrapped in Go-template raw
# strings (double braces around a backtick-quoted literal) so that Helm emits
# them verbatim and Prometheus expands them when the alert fires.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "edge-monitoring-server.fullname" . }}
  labels:
    {{- include "edge-monitoring-server.labels" . | nindent 4 }}
spec:
  groups:
    - name: ace-e2e-tests-v2.rules
      rules:
        # Informational: the maintenance-window metric has been above zero
        # for one minute.
        - alert: ScheduledDowntime
          annotations:
            message: The cluster {{`{{ .Labels.name }}`}} is undergoing scheduled maintenance.
          expr: aetheredge_in_maintenance_window{endpoint="metrics80"} > 0
          for: 1m
          labels:
            severity: info
        # The last-update timestamp is more than 600 seconds old, and has
        # stayed that way for one minute.
        - alert: SingleEdgeTestNotReporting
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} has not reported results for at least 10 minutes.
          expr: (time() - aetheredge_last_update{endpoint="metrics80"}) > 600
          for: 1m
          labels:
            severity: critical
        # Per-cluster connect test: a value below 1 is treated as a failure
        # and must persist for ten minutes before the alert fires.
        - alert: SingleEdgeConnectTestFailing
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting UE connect failure for at least 10 minutes.
          expr: aetheredge_connect_test_ok{endpoint="metrics80"} < 1
          for: 10m
          labels:
            severity: critical
        # Per-cluster ping test: same shape as the connect-test rule above.
        - alert: SingleEdgePingTestFailing
          annotations:
            message: |
              The E2E test on cluster {{`{{ .Labels.name }}`}} is reporting that UE cannot ping the Internet for at least 10 minutes.
          expr: aetheredge_ping_test_ok{endpoint="metrics80"} < 1
          for: 10m
          labels:
            severity: critical
        # Fleet-wide connect test: fires when more than half of the clusters
        # are failing. The expression is written as (1 - avg) so that $value
        # is the failing fraction that the message prints; the firing
        # condition is equivalent to avg(...) < 0.5.
        # NOTE(review): assumes aetheredge_connect_test_ok is a 0/1 gauge, as
        # the "< 1" check in SingleEdgeConnectTestFailing suggests - confirm.
        - alert: ManyEdgeConnectTestsFailing
          annotations:
            message: |
              {{`{{ $value | humanizePercentage }}`}} of the clusters are reporting UE connect failures.
          expr: (1 - avg(aetheredge_connect_test_ok{endpoint="metrics80"})) > 0.5
          for: 10m
          labels:
            severity: critical
{{- end }}