From ffd4f8c88b18c88edbe2614f5e327cfea1d829bf Mon Sep 17 00:00:00 2001 From: Coleen Iona Quadros Date: Wed, 20 Sep 2023 12:12:55 +0200 Subject: [PATCH] remove noisy alerts rules/raw --- configuration/observatorium/slo.go | 24 - ...s-slos-mst-production.prometheusrules.yaml | 414 ------------------ .../rhobs-slos-mst-stage.prometheusrules.yaml | 414 ------------------ ...slos-rhobsp02ue1-prod.prometheusrules.yaml | 414 ------------------ ...-telemeter-production.prometheusrules.yaml | 414 ------------------ ...-slos-telemeter-stage.prometheusrules.yaml | 414 ------------------ 6 files changed, 2094 deletions(-) diff --git a/configuration/observatorium/slo.go b/configuration/observatorium/slo.go index 958d9a9f61..965f64cc8b 100644 --- a/configuration/observatorium/slo.go +++ b/configuration/observatorium/slo.go @@ -335,30 +335,6 @@ func ObservatoriumSLOs(envName rhobsInstanceEnv, signal signal) []pyrrav1alpha1. alertName: "APIMetricsQueryRangeAvailabilityErrorBudgetBurning", sloType: sloTypeAvailability, }, - { - name: "api-rules-raw-write-availability-slo", - labels: map[string]string{ - slo.PropagationLabelsPrefix + "service": "observatorium-api", - "instance": string(envName), - }, - description: "API /rules/raw endpoint for writes is burning too much error budget to guarantee availability SLOs.", - successOrErrorsExpr: "http_requests_total{job=\"" + apiJobSelector[envName] + "\", handler=\"rules-raw\", method=\"PUT\", group=\"metricsv1\", code=~\"^5..$\"}", - totalExpr: "http_requests_total{job=\"" + apiJobSelector[envName] + "\", handler=\"rules-raw\", method=\"PUT\", group=\"metricsv1\"}", - alertName: "APIRulesRawWriteAvailabilityErrorBudgetBurning", - sloType: sloTypeAvailability, - }, - { - name: "api-rules-raw-read-availability-slo", - labels: map[string]string{ - slo.PropagationLabelsPrefix + "service": "observatorium-api", - "instance": string(envName), - }, - description: "API /rules/raw endpoint for reads is burning too much error budget to guarantee availability SLOs.", - successOrErrorsExpr: "http_requests_total{job=\"" + apiJobSelector[envName] + "\", handler=\"rules-raw\", method=\"GET\", group=\"metricsv1\", code=~\"^5..$\"}", - totalExpr: "http_requests_total{job=\"" + apiJobSelector[envName] + "\", handler=\"rules-raw\", method=\"GET\", group=\"metricsv1\"}", - alertName: "APIRulesRawReadAvailabilityErrorBudgetBurning", - sloType: sloTypeAvailability, - }, { name: "api-rules-read-availability-slo", labels: map[string]string{ diff --git a/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml index c652856ae5..6ac0a4fbcb 100755 --- a/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml @@ -773,420 +773,6 @@ spec: labels: slo: api-metrics-query-range-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-write-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - severity: medium - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"} - or vector(0)) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml index 4621245a4b..9e5ff3e320 100755 --- a/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml @@ -773,420 +773,6 @@ spec: labels: slo: api-metrics-query-range-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-write-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - severity: medium - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"} - or vector(0)) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml index 01478ceb6c..b41303924f 100755 --- a/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml @@ -773,420 +773,6 @@ spec: labels: slo: api-metrics-query-range-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-write-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - severity: medium - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="PUT"} - or vector(0)) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml index 9f22d8d0f9..2f880061c8 100755 --- a/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml @@ -1531,420 +1531,6 @@ spec: labels: slo: api-metrics-query-range-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-write-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - severity: medium - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 6h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 4d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"} - or vector(0)) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml index 98a7fad94d..d2885f8a13 100755 --- a/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml @@ -1531,420 +1531,6 @@ spec: labels: slo: api-metrics-query-range-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-write-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - severity: medium - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: PUT - service: observatorium-api - slo: api-rules-raw-write-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 6h - method: PUT - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-write-availability-slo - - alert: APIRulesRawWriteAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for writes is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT",slo="api-rules-raw-write-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 4d - method: PUT - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-write-availability-slo - - interval: 30s - name: api-rules-raw-write-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"}) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="PUT"} - or vector(0)) - labels: - slo: api-rules-raw-write-availability-slo - record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: