-
Notifications
You must be signed in to change notification settings - Fork 159
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[sync] prometheus-unit-tests from rhoai
- Loading branch information
Showing
17 changed files
with
1,340 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
name: Run prometheus unit tests | ||
on: | ||
pull_request: | ||
paths: | ||
- 'config/monitoring/prometheus/**' | ||
- 'tests/prometheus_unit_tests/**' | ||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v4 | ||
- name: Setup Go | ||
uses: actions/setup-go@v4 | ||
with: | ||
go-version-file: go.mod | ||
- name: Install Promtool | ||
run: | | ||
sudo apt-get update && sudo apt-get install -y prometheus | ||
- name: Run prometheus-unit-tests | ||
run: make test-alerts |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,4 +62,3 @@ local.mk | |
|
||
# Ignore temporary files created by the Makefile | ||
*.mktmp.* | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Ignore temporary alert yaml files created by the Makefile | ||
*.rules.yaml |
129 changes: 129 additions & 0 deletions
129
tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
rule_files: | ||
- codeflare-alerting.rules.yaml | ||
|
||
evaluation_interval: 1m | ||
|
||
tests: | ||
# burn rate | ||
- interval: 1m | ||
input_series: | ||
- series: probe_success:burnrate5m{instance="codeflare-operator"} | ||
values: "0x60" | ||
- series: probe_success:burnrate30m{instance="codeflare-operator"} | ||
values: "0x60" | ||
- series: probe_success:burnrate1h{instance="codeflare-operator"} | ||
values: "0x60" | ||
- series: probe_success:burnrate2h{instance="codeflare-operator"} | ||
values: "0x60" | ||
- series: probe_success:burnrate6h{instance="codeflare-operator"} | ||
values: "0x60" | ||
- series: probe_success:burnrate1d{instance="codeflare-operator"} | ||
values: "0x60" | ||
alert_rule_test: | ||
- eval_time: 1h | ||
alertname: CodeFlare Operator Probe Success Burn Rate | ||
exp_alerts: [] | ||
|
||
- interval: 1m | ||
input_series: | ||
- series: probe_success:burnrate5m{instance="codeflare-operator"} | ||
values: "1+1x60" | ||
- series: probe_success:burnrate1h{instance="codeflare-operator"} | ||
values: "1+1x60" | ||
alert_rule_test: | ||
- eval_time: 2m | ||
alertname: CodeFlare Operator Probe Success Burn Rate | ||
exp_alerts: | ||
- exp_labels: | ||
alertname: CodeFlare Operator Probe Success Burn Rate | ||
instance: "codeflare-operator" | ||
namespace: "redhat-ods-applications" | ||
severity: info | ||
exp_annotations: | ||
message: "High error budget burn for codeflare-operator (current value: 3)." | ||
summary: CodeFlare Operator Probe Success Burn Rate | ||
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' | ||
|
||
- interval: 1m | ||
input_series: | ||
- series: probe_success:burnrate30m{instance="codeflare-operator"} | ||
values: "1+1x60" | ||
- series: probe_success:burnrate6h{instance="codeflare-operator"} | ||
values: "1+1x60" | ||
alert_rule_test: | ||
- eval_time: 15m | ||
alertname: CodeFlare Operator Probe Success Burn Rate | ||
exp_alerts: | ||
- exp_labels: | ||
alertname: CodeFlare Operator Probe Success Burn Rate | ||
instance: "codeflare-operator" | ||
namespace: "redhat-ods-applications" | ||
severity: info | ||
exp_annotations: | ||
message: "High error budget burn for codeflare-operator (current value: 16)." | ||
summary: CodeFlare Operator Probe Success Burn Rate | ||
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' | ||
|
||
- interval: 1m | ||
input_series: | ||
- series: probe_success:burnrate2h{instance="codeflare-operator"} | ||
values: "1+1x60" | ||
- series: probe_success:burnrate1d{instance="codeflare-operator"} | ||
values: "1+1x60" | ||
alert_rule_test: | ||
- eval_time: 1h | ||
alertname: CodeFlare Operator Probe Success Burn Rate | ||
exp_alerts: | ||
- exp_labels: | ||
alertname: CodeFlare Operator Probe Success Burn Rate | ||
instance: "codeflare-operator" | ||
namespace: "redhat-ods-applications" | ||
severity: info | ||
exp_annotations: | ||
message: "High error budget burn for codeflare-operator (current value: 61)." | ||
summary: CodeFlare Operator Probe Success Burn Rate | ||
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' | ||
|
||
# operator running | ||
- interval: 1m | ||
input_series: | ||
- series: up{job="CodeFlare Operator"} | ||
values: "1" | ||
alert_rule_test: | ||
- eval_time: 1m | ||
alertname: CodeFlare Operator is not running | ||
exp_alerts: [] | ||
|
||
- interval: 1m | ||
input_series: | ||
- series: up{job="CodeFlare Operator"} | ||
values: "0" | ||
alert_rule_test: | ||
- eval_time: 1m | ||
alertname: CodeFlare Operator is not running | ||
exp_alerts: | ||
- exp_labels: | ||
alertname: CodeFlare Operator is not running | ||
job: "CodeFlare Operator" | ||
namespace: "redhat-ods-applications" | ||
severity: info | ||
exp_annotations: | ||
description: This alert fires when the CodeFlare Operator is not running. | ||
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' | ||
summary: Alerting for CodeFlare Operator | ||
|
||
- interval: 1m | ||
input_series: | ||
alert_rule_test: | ||
- eval_time: 2m | ||
alertname: CodeFlare Operator taking too long to be up | ||
exp_alerts: | ||
- exp_labels: | ||
alertname: CodeFlare Operator taking too long to be up | ||
namespace: "redhat-ods-applications" | ||
job: "CodeFlare Operator" | ||
severity: info | ||
exp_annotations: | ||
description: This alert fires when the CodeFlare Operator takes over 2 min. to come back online. Either CodeFlare Operator is not running and failing to become ready, is misconfigured, or the metrics endpoint is not responding. | ||
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-absent-over-time.md' | ||
summary: Alerting for CodeFlare Operator |
Oops, something went wrong.