From 3bbcaf075ef9adf0f139c395e62ab6fe281f49ea Mon Sep 17 00:00:00 2001 From: Coleen Iona Quadros Date: Wed, 20 Sep 2023 12:41:10 +0200 Subject: [PATCH] Remove noisy SLO alerts rules/raw (#600) * remove noisy alerts rules/raw * remove only GET endpoint --- configuration/observatorium/slo.go | 12 - ...s-slos-mst-production.prometheusrules.yaml | 207 ------------------ .../rhobs-slos-mst-stage.prometheusrules.yaml | 207 ------------------ ...slos-rhobsp02ue1-prod.prometheusrules.yaml | 207 ------------------ ...-telemeter-production.prometheusrules.yaml | 207 ------------------ ...-slos-telemeter-stage.prometheusrules.yaml | 207 ------------------ 6 files changed, 1047 deletions(-) diff --git a/configuration/observatorium/slo.go b/configuration/observatorium/slo.go index 958d9a9f61..b0b7842bfb 100644 --- a/configuration/observatorium/slo.go +++ b/configuration/observatorium/slo.go @@ -347,18 +347,6 @@ func ObservatoriumSLOs(envName rhobsInstanceEnv, signal signal) []pyrrav1alpha1. alertName: "APIRulesRawWriteAvailabilityErrorBudgetBurning", sloType: sloTypeAvailability, }, - { - name: "api-rules-raw-read-availability-slo", - labels: map[string]string{ - slo.PropagationLabelsPrefix + "service": "observatorium-api", - "instance": string(envName), - }, - description: "API /rules/raw endpoint for reads is burning too much error budget to guarantee availability SLOs.", - successOrErrorsExpr: "http_requests_total{job=\"" + apiJobSelector[envName] + "\", handler=\"rules-raw\", method=\"GET\", group=\"metricsv1\", code=~\"^5..$\"}", - totalExpr: "http_requests_total{job=\"" + apiJobSelector[envName] + "\", handler=\"rules-raw\", method=\"GET\", group=\"metricsv1\"}", - alertName: "APIRulesRawReadAvailabilityErrorBudgetBurning", - sloType: sloTypeAvailability, - }, { name: "api-rules-read-availability-slo", labels: map[string]string{ diff --git a/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml index c652856ae5..a1981b20a1 100755 --- a/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-mst-production.prometheusrules.yaml @@ -980,213 +980,6 @@ spec: labels: slo: api-rules-raw-write-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml index 4621245a4b..becf3af7f0 100755 --- a/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-mst-stage.prometheusrules.yaml @@ -980,213 +980,6 @@ spec: labels: slo: api-rules-raw-write-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml index 01478ceb6c..6f9447290b 100755 --- a/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-rhobsp02ue1-prod.prometheusrules.yaml @@ -980,213 +980,6 @@ spec: labels: slo: api-rules-raw-write-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-mst-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-mst-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml index 9f22d8d0f9..955494cd13 100755 --- a/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-telemeter-production.prometheusrules.yaml @@ -1738,213 +1738,6 @@ spec: labels: slo: api-rules-raw-write-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: diff --git a/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml b/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml index 98a7fad94d..9f8c634e3e 100755 --- a/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml +++ b/resources/observability/prometheusrules/rhobs-slos-telemeter-stage.prometheusrules.yaml @@ -1738,213 +1738,6 @@ spec: labels: slo: api-rules-raw-write-availability-slo record: pyrra_errors_total - - interval: 2m30s - name: api-rules-raw-read-availability-slo-increase - rules: - - expr: sum by(code) (increase(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4w])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:increase4w - - alert: SLOMetricAbsent - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: absent(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - == 1 - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - severity: medium - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo - rules: - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[5m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate5m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[30m])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate30m - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[2h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate2h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[6h])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate6h - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[1d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate1d - - expr: sum(rate(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - / sum(rate(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}[4d])) - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - method: GET - service: observatorium-api - slo: api-rules-raw-read-availability-slo - record: http_requests:burnrate4d - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate5m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) and http_requests:burnrate1h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (14 * (1-0.99)) - for: 2m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 5m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate30m{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) and http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (7 * (1-0.99)) - for: 15m - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 6h - method: GET - service: observatorium-api - severity: high - short_burnrate_window: 30m - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate2h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) and http_requests:burnrate1d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (2 * (1-0.99)) - for: 1h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 1d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 2h - slo: api-rules-raw-read-availability-slo - - alert: APIRulesRawReadAvailabilityErrorBudgetBurning - annotations: - dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m - message: API /rules/raw endpoint for reads is burning too much error budget - to guarantee availability SLOs. - runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning - expr: http_requests:burnrate6h{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) and http_requests:burnrate4d{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET",slo="api-rules-raw-read-availability-slo"} - > (1 * (1-0.99)) - for: 3h - labels: - group: metricsv1 - handler: rules-raw - job: observatorium-observatorium-api - long_burnrate_window: 4d - method: GET - service: observatorium-api - severity: medium - short_burnrate_window: 6h - slo: api-rules-raw-read-availability-slo - - interval: 30s - name: api-rules-raw-read-availability-slo-generic - rules: - - expr: "0.99" - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_objective - - expr: 2419200 - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_window - - expr: 1 - sum(http_requests:increase4w{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) / sum(http_requests:increase4w{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_availability - - expr: sum(http_requests_total{group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"}) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_requests_total - - expr: sum(http_requests_total{code=~"^5..$",group="metricsv1",handler="rules-raw",job="observatorium-observatorium-api",method="GET"} - or vector(0)) - labels: - slo: api-rules-raw-read-availability-slo - record: pyrra_errors_total - interval: 2m30s name: api-rules-read-availability-slo-increase rules: