Skip to content

Commit

Permalink
Add obsctl reloader alerting rules (#603)
Browse files Browse the repository at this point in the history
* Add obsctl-reloader alert rules

* Refactor obsctl-reloader alert rules

* Fix typo

* Prefix alerts for easy eye grepping
  • Loading branch information
douglascamata authored Sep 22, 2023
1 parent 3bbcaf0 commit 7847d49
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 2 deletions.
4 changes: 2 additions & 2 deletions jsonnetfile.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -395,8 +395,8 @@
"subdir": "jsonnet/lib"
}
},
"version": "c720483113f66880d5d318adf707baf788e7fcfa",
"sum": "NGOtOzgw5pgFCt9+wxrDMTzRcymPxICzarVQPwf7Upk="
"version": "1df7a85a21606d7e1c42262a386dc6b377eb18b7",
"sum": "x00LDrH1x0wQWO95LiFaoUuFIPMWb1Acaem6ARPwaEk="
},
{
"source": {
Expand Down
5 changes: 5 additions & 0 deletions observability/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ local var = import 'utils.jsonnet';
instance_name_filter: var.instance_name_filter,
},
},
// Configuration overrides for the obsctl-reloader alerts.
// prometheusrules.jsonnet adds this object onto the imported
// obsctl-reloader alerts library (`... + config.obsctlReloader`).
obsctlReloader: {
// `+::` merges into the library's hidden `_config` object instead of replacing it.
_config+:: {
// PromQL label matcher for the obsctl-reloader job; the rendered alert
// expressions carry this same `job="rules-obsctl-reloader"` matcher.
obsctlReloaderSelector: 'job="rules-obsctl-reloader"',
},
},
alertmanager: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/config.libsonnet') {
title: 'Alertmanager / Overview',
_config+:: {
Expand Down
8 changes: 8 additions & 0 deletions observability/prometheusrules.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ local appSREOverwrites(environment) = {
std.startsWith(name, 'rhobs-mst') && environment == 'stage' then '92520ea4d6976f30d1618164e186ef9b'
else if
std.startsWith(name, 'gubernator') then 'no-dashboard'
else if
std.startsWith(name, 'obsctl-reloader') then 'no-dashboard'
else if
std.startsWith(name, 'alertmanager') then 'alertmanager-overview'
else error 'no dashboard id for group %s' % name,
Expand Down Expand Up @@ -466,3 +468,9 @@ local renderAlerts(name, environment, mixin) = {
'observatorium-http-traffic-stage.prometheusrules': renderAlerts('observatorium-http-traffic-stage', 'stage', httpTrafficMonitoringAlerts),
'observatorium-http-traffic-production.prometheusrules': renderAlerts('observatorium-http-traffic-production', 'production', httpTrafficMonitoringAlerts),
}

{
  // obsctl-reloader alerts: the upstream alerts library with this repository's
  // `config.obsctlReloader` overrides layered on top.
  local reloaderAlertsMixin =
    (import 'github.com/rhobs/obsctl-reloader/jsonnet/lib/alerts.libsonnet')
    + config.obsctlReloader,

  // One rendered PrometheusRule file per environment.
  'observatorium-obsctl-reloader-stage.prometheusrules':
    renderAlerts('obsctl-reloader-stage', 'stage', reloaderAlertsMixin),
  'observatorium-obsctl-reloader-production.prometheusrules':
    renderAlerts('obsctl-reloader-production', 'production', reloaderAlertsMixin),
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
# Alerting rules for obsctl-reloader (production).
#
# NOTE(review): this file appears to be rendered from the obsctl-reloader
# jsonnet alerts library via observability/prometheusrules.jsonnet; the
# expression and annotation fixes below should be mirrored in that library,
# otherwise they will be lost on the next regeneration.
#
# In every rule the threshold comparison is applied to the whole ratio. Do not
# put `or vector(0)` in front of the comparison: in PromQL comparison operators
# bind tighter than `or`, so `(ratio) or vector(0) > 0.10` is evaluated as
# `(ratio) or (vector(0) > 0.10)` and returns the ratio unfiltered.
#
# `humanizePercentage` already renders a trailing "%", so the annotations do
# not append another one.
#
# NOTE(review): the metric names end in `_total`; if they are counters,
# `increase()` is probably what is wanted instead of `sum_over_time()` - confirm.
$schema: /openshift/prometheus-rule-1.yml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: app-sre
    role: alert-rules
  name: obsctl-reloader-production
spec:
  groups:
  - name: obsctl-reloader.rules
    rules:
    - alert: ObsCtlRulesStoreServerError
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
        message: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulesstoreservererror
        summary: Failing to send rules to Observatorium.
      # `on() group_left()` is required: sum() drops every label while the
      # numerator keeps its own (job, status_code, ...), so a plain `/` would
      # find no matching pairs and the ratio would always be empty.
      # NOTE(review): this yields each failing series' share of all store
      # operations; confirm whether a per-tenant ratio was intended instead.
      expr: |
        (
          sum_over_time(obsctl_reloader_prom_rules_store_ops_total{status_code=~"5..|4..", job="rules-obsctl-reloader"}[5m])
        / on() group_left()
          sum(sum_over_time(obsctl_reloader_prom_rules_store_ops_total{job="rules-obsctl-reloader"}[5m]))
        )
        > 0.10
      for: 10m
      labels:
        service: telemeter
        severity: critical
    - alert: ObsCtlRulesSetFailure
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
        message: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulessetfailure
        summary: Failing to set rules due to issue before talking to Observatorium.
      # NOTE(review): the numerator carries a `reason` label; if the
      # denominator metric does not, the division needs explicit vector
      # matching (e.g. `ignoring(reason) group_left`) - confirm label sets.
      expr: |
        (
          sum_over_time(obsctl_reloader_prom_rule_set_failures_total{reason!="rules_store_error", job="rules-obsctl-reloader"}[5m])
        /
          sum_over_time(obsctl_reloader_prom_rule_set_total{job="rules-obsctl-reloader"}[5m])
        )
        > 0.10
      for: 10m
      labels:
        service: telemeter
        severity: medium
    - alert: ObsCtlFetchRulesFailed
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
        message: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlfetchrulesfailed
        summary: Failing to fetch rules from the local cluster.
      expr: |
        (
          sum_over_time(obsctl_reloader_prom_rule_fetch_failures_total{job="rules-obsctl-reloader"}[5m])
        /
          sum_over_time(obsctl_reloader_prom_rule_fetches_total{job="rules-obsctl-reloader"}[5m])
        )
        > 0.20
      for: 5m
      labels:
        service: telemeter
        severity: critical
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
# Alerting rules for obsctl-reloader (stage).
#
# NOTE(review): this file appears to be rendered from the obsctl-reloader
# jsonnet alerts library via observability/prometheusrules.jsonnet; the
# expression and annotation fixes below should be mirrored in that library,
# otherwise they will be lost on the next regeneration.
#
# In every rule the threshold comparison is applied to the whole ratio. Do not
# put `or vector(0)` in front of the comparison: in PromQL comparison operators
# bind tighter than `or`, so `(ratio) or vector(0) > 0.10` is evaluated as
# `(ratio) or (vector(0) > 0.10)` and returns the ratio unfiltered.
#
# `humanizePercentage` already renders a trailing "%", so the annotations do
# not append another one.
#
# NOTE(review): the metric names end in `_total`; if they are counters,
# `increase()` is probably what is wanted instead of `sum_over_time()` - confirm.
$schema: /openshift/prometheus-rule-1.yml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: app-sre
    role: alert-rules
  name: obsctl-reloader-stage
spec:
  groups:
  - name: obsctl-reloader.rules
    rules:
    - alert: ObsCtlRulesStoreServerError
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
        message: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulesstoreservererror
        summary: Failing to send rules to Observatorium.
      # `on() group_left()` is required: sum() drops every label while the
      # numerator keeps its own (job, status_code, ...), so a plain `/` would
      # find no matching pairs and the ratio would always be empty.
      # NOTE(review): this yields each failing series' share of all store
      # operations; confirm whether a per-tenant ratio was intended instead.
      expr: |
        (
          sum_over_time(obsctl_reloader_prom_rules_store_ops_total{status_code=~"5..|4..", job="rules-obsctl-reloader"}[5m])
        / on() group_left()
          sum(sum_over_time(obsctl_reloader_prom_rules_store_ops_total{job="rules-obsctl-reloader"}[5m]))
        )
        > 0.10
      for: 10m
      labels:
        service: telemeter
        severity: high
    - alert: ObsCtlRulesSetFailure
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
        message: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulessetfailure
        summary: Failing to set rules due to issue before talking to Observatorium.
      # NOTE(review): the numerator carries a `reason` label; if the
      # denominator metric does not, the division needs explicit vector
      # matching (e.g. `ignoring(reason) group_left`) - confirm label sets.
      expr: |
        (
          sum_over_time(obsctl_reloader_prom_rule_set_failures_total{reason!="rules_store_error", job="rules-obsctl-reloader"}[5m])
        /
          sum_over_time(obsctl_reloader_prom_rule_set_total{job="rules-obsctl-reloader"}[5m])
        )
        > 0.10
      for: 10m
      labels:
        service: telemeter
        severity: medium
    - alert: ObsCtlFetchRulesFailed
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
        message: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlfetchrulesfailed
        summary: Failing to fetch rules from the local cluster.
      expr: |
        (
          sum_over_time(obsctl_reloader_prom_rule_fetch_failures_total{job="rules-obsctl-reloader"}[5m])
        /
          sum_over_time(obsctl_reloader_prom_rule_fetches_total{job="rules-obsctl-reloader"}[5m])
        )
        > 0.20
      for: 5m
      labels:
        service: telemeter
        severity: high

0 comments on commit 7847d49

Please sign in to comment.