Skip to content

Commit

Permalink
[sync] prometheus-unit-tests from rhoai
Browse files Browse the repository at this point in the history
  • Loading branch information
biswassri committed Feb 12, 2025
1 parent 39da621 commit 4fbd91e
Show file tree
Hide file tree
Showing 17 changed files with 1,340 additions and 3 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/prometheus-unit-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Runs the Prometheus alert unit tests (`make test-alerts`) on pull requests
# that touch the Prometheus configuration or the alert unit tests.
name: Run prometheus unit tests
on:
  pull_request:
    paths:
      - 'config/monitoring/prometheus/**'
      - 'tests/prometheus_unit_tests/**'
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      # Go version is taken from go.mod so CI follows the project's toolchain.
      - name: Setup Go
        uses: actions/setup-go@v4
        with:
          go-version-file: go.mod
      # The `prometheus` apt package is installed to obtain promtool.
      - name: Install Promtool
        run: |
          sudo apt-get update && sudo apt-get install -y prometheus
      - name: Run prometheus-unit-tests
        run: make test-alerts
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,3 @@ local.mk

# Ignore temporary files created by the Makefile
*.mktmp.*

24 changes: 24 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,14 @@ IMAGE_BUILD_FLAGS ?= --build-arg USE_LOCAL=$(USE_LOCAL)
IMAGE_BUILD_FLAGS += --build-arg CGO_ENABLED=$(CGO_ENABLED)
IMAGE_BUILD_FLAGS += --platform $(PLATFORM)

# Prometheus-Unit Tests Parameters
# These are recursively expanded (`=`) on purpose: overriding
# PROMETHEUS_CONFIG_DIR or PROMETHEUS_TEST_DIR later (command line or an
# included makefile) is then reflected in the derived values below.

# Directory containing the Prometheus ConfigMap manifest.
PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps
# ConfigMap manifest whose `.data."<name>.rules"` entries hold the alert rules.
PROMETHEUS_CONFIG_YAML = $(PROMETHEUS_CONFIG_DIR)/prometheus-configs.yaml
# Directory containing the promtool unit tests (<name>.unit-tests.yaml).
PROMETHEUS_TEST_DIR = ./tests/prometheus_unit_tests
PROMETHEUS_ALERT_TESTS = $(wildcard $(PROMETHEUS_TEST_DIR)/*.unit-tests.yaml)

# Severity argument passed to the check_alert_tests.sh script.
ALERT_SEVERITY = critical

# Read any custom variables overrides from a local.mk file. This will only be read if it exists in the
# same directory as this Makefile. Variables can be specified in the standard format supported by
# GNU Make since `include` processes any valid Makefile
Expand Down Expand Up @@ -390,6 +398,22 @@ unit-test: envtest
OPERATOR_NAMESPACE=$(OPERATOR_NAMESPACE) KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(TEST_SRC) -v -coverprofile cover.out
CLEANFILES += cover.out

# Generate <name>.rules.yaml next to each <name>.unit-tests.yaml by extracting
# the `.data."<name>.rules"` entry from the Prometheus ConfigMap manifest.
#  - $(YQ) is order-only: the tool must exist, but its timestamp must not
#    force the rules files to be regenerated.
#  - `-e` makes yq exit non-zero when the key is missing (null result), so a
#    misnamed test file fails here instead of producing a rules file that
#    contains only `null`.
#  - Output goes to a temporary file first and is moved into place only on
#    success, so a failed extraction never leaves a truncated target that
#    looks up to date.
$(PROMETHEUS_TEST_DIR)/%.rules.yaml: $(PROMETHEUS_TEST_DIR)/%.unit-tests.yaml $(PROMETHEUS_CONFIG_YAML) | $(YQ)
	$(YQ) eval -e ".data.\"$(@F:.rules.yaml=.rules)\"" $(PROMETHEUS_CONFIG_YAML) > $@.mktmp.yaml || { rm -f $@.mktmp.yaml; exit 1; }
	mv -f $@.mktmp.yaml $@

# One generated rules file per unit-test file.
PROMETHEUS_ALERT_RULES := $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml)

# Run prometheus-alert-unit-tests
.PHONY: test-alerts
test-alerts: $(PROMETHEUS_ALERT_RULES) ## Run the Prometheus alert unit tests with promtool
	promtool test rules $(PROMETHEUS_ALERT_TESTS)

# Check for alerts without unit-tests
.PHONY: check-prometheus-alert-unit-tests
check-prometheus-alert-unit-tests: $(PROMETHEUS_ALERT_RULES) ## Check for alerts that have no unit tests
	$(PROMETHEUS_TEST_DIR)/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(PROMETHEUS_TEST_DIR) $(ALERT_SEVERITY)
CLEANFILES += $(PROMETHEUS_ALERT_RULES)

.PHONY: e2e-test
e2e-test: ## Run e2e tests for the controller
go test ./tests/e2e/ -run ^TestOdhOperator -v --operator-namespace=${OPERATOR_NAMESPACE} ${E2E_TEST_FLAGS}
Expand Down
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ e.g `make image-build USE_LOCAL=true"`
**Deploying operator using OLM**

- To create a new bundle in the defined operator namespace, run the following command:

```commandline
export OPERATOR_NAMESPACE=<namespace-to-install-operator>
make bundle
Expand Down Expand Up @@ -239,7 +239,7 @@ There are 2 ways to test your changes with modification:

Whenever a new api is added or a new field is added to the CRD, please make sure to run the command:
```commandline
make api-docs
make api-docs
```
This will ensure that the docs for the APIs are updated accordingly.

Expand Down Expand Up @@ -431,6 +431,21 @@ make run-nowebhook
make e2e-test -e OPERATOR_NAMESPACE=<namespace> -e E2E_TEST_FLAGS="--test-operator-controller=false --test-webhook=false --test-component=dashboard"
```

## Run Prometheus Unit Tests for Alerts

Unit tests for Prometheus alerts are included in the repository. You can run them using the following command:

```shell
make test-alerts
```

To check for alerts that don't have unit tests, run the following command:

```shell
make check-prometheus-alert-unit-tests
```

To add a new unit test file, give it the same name as the corresponding rules key in the [prometheus ConfigMap](./config/monitoring/prometheus/apps/prometheus-configs.yaml), with the `.rules` suffix replaced by `.unit-tests.yaml` (for example, `codeflare-alerting.rules` is tested by `codeflare-alerting.unit-tests.yaml`).

### API Overview

Expand Down
2 changes: 2 additions & 0 deletions tests/prometheus_unit_tests/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore temporary alert yaml files created by the Makefile
*.rules.yaml
129 changes: 129 additions & 0 deletions tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# promtool unit tests for the CodeFlare alerting rules.
# codeflare-alerting.rules.yaml is not checked in: the Makefile generates it
# from the Prometheus ConfigMap and it is git-ignored.
rule_files:
- codeflare-alerting.rules.yaml

evaluation_interval: 1m

tests:
# burn rate
# Case 1: all six burn-rate series stay at 0 for the whole run, so no
# burn-rate alert is expected at 1h.
- interval: 1m
input_series:
- series: probe_success:burnrate5m{instance="codeflare-operator"}
values: "0x60"
- series: probe_success:burnrate30m{instance="codeflare-operator"}
values: "0x60"
- series: probe_success:burnrate1h{instance="codeflare-operator"}
values: "0x60"
- series: probe_success:burnrate2h{instance="codeflare-operator"}
values: "0x60"
- series: probe_success:burnrate6h{instance="codeflare-operator"}
values: "0x60"
- series: probe_success:burnrate1d{instance="codeflare-operator"}
values: "0x60"
alert_rule_test:
- eval_time: 1h
alertname: CodeFlare Operator Probe Success Burn Rate
exp_alerts: []

# Case 2: only the 5m and 1h series are present, ramping 1, 2, 3, ...;
# at 2m the sample value is 3, which is the value expected in the message.
- interval: 1m
input_series:
- series: probe_success:burnrate5m{instance="codeflare-operator"}
values: "1+1x60"
- series: probe_success:burnrate1h{instance="codeflare-operator"}
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: CodeFlare Operator Probe Success Burn Rate
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator Probe Success Burn Rate
instance: "codeflare-operator"
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
message: "High error budget burn for codeflare-operator (current value: 3)."
summary: CodeFlare Operator Probe Success Burn Rate
# NOTE(review): this case expects the "availability" SOP URL, while the other
# two burn-rate cases expect "probe-success-burn-rate" — confirm this matches
# the rule definition and is intentional.
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md'

# Case 3: only the 30m and 6h series are present, ramping 1, 2, 3, ...;
# at 15m the sample value is 16, which is the value expected in the message.
- interval: 1m
input_series:
- series: probe_success:burnrate30m{instance="codeflare-operator"}
values: "1+1x60"
- series: probe_success:burnrate6h{instance="codeflare-operator"}
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: CodeFlare Operator Probe Success Burn Rate
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator Probe Success Burn Rate
instance: "codeflare-operator"
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
message: "High error budget burn for codeflare-operator (current value: 16)."
summary: CodeFlare Operator Probe Success Burn Rate
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md'

# Case 4: only the 2h and 1d series are present, ramping 1, 2, 3, ...;
# at 1h the sample value is 61, which is the value expected in the message.
- interval: 1m
input_series:
- series: probe_success:burnrate2h{instance="codeflare-operator"}
values: "1+1x60"
- series: probe_success:burnrate1d{instance="codeflare-operator"}
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: CodeFlare Operator Probe Success Burn Rate
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator Probe Success Burn Rate
instance: "codeflare-operator"
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
message: "High error budget burn for codeflare-operator (current value: 61)."
summary: CodeFlare Operator Probe Success Burn Rate
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md'

# operator running
# Case 1: `up` is 1 for the CodeFlare Operator job, so the "not running"
# alert is expected to stay silent at 1m.
- interval: 1m
input_series:
- series: up{job="CodeFlare Operator"}
values: 1
alert_rule_test:
- eval_time: 1m
alertname: CodeFlare Operator is not running
exp_alerts: []

# Case 2: `up` is 0 for the CodeFlare Operator job, so the "not running"
# alert is expected to fire at 1m.
- interval: 1m
input_series:
- series: up{job="CodeFlare Operator"}
values: 0
alert_rule_test:
- eval_time: 1m
alertname: CodeFlare Operator is not running
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator is not running
job: "CodeFlare Operator"
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
description: This alert fires when the CodeFlare Operator is not running.
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md'
summary: Alerting for CodeFlare Operator

# Case 3: no input series at all; the "taking too long to be up" alert is
# expected at 2m.
# NOTE(review): presumably the rule fires on the absence of the `up` series
# for this job — confirm against the rule expression.
- interval: 1m
input_series:
alert_rule_test:
- eval_time: 2m
alertname: CodeFlare Operator taking too long to be up
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator taking too long to be up
namespace: "redhat-ods-applications"
job: "CodeFlare Operator"
severity: info
exp_annotations:
description: This alert fires when the CodeFlare Operator takes over 2 min. to come back online. Either CodeFlare Operator is not running and failing to become ready, is misconfigured, or the metrics endpoint is not responding.
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-absent-over-time.md'
summary: Alerting for CodeFlare Operator
Loading

0 comments on commit 4fbd91e

Please sign in to comment.