diff --git a/src/loki_alert_rules/capacity.rules b/src/loki_alert_rules/capacity.rules index 56465e88f..bc9ad4a3f 100644 --- a/src/loki_alert_rules/capacity.rules +++ b/src/loki_alert_rules/capacity.rules @@ -6,6 +6,7 @@ groups: quantile_over_time(0.5, {filename="/var/log/github-runner-metrics.log"} | json event="event",duration="queue_duration",flavor="flavor" | __error__="" | event="runner_start" | unwrap duration[1h]) by(flavor) > 1800 labels: severity: high + type: runner-capacity for: 1h annotations: description: Job queue duration is higher than 30 minutes for half of the runners of application {{$labels.flavor}} @@ -15,6 +16,7 @@ groups: sum by(flavor)(last_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event",idle_runners="idle_runners",flavor="flavor" | event="reconciliation" | unwrap idle_runners[1h])) == 0 labels: severity: high + type: runner-capacity for: 2h annotations: summary: No idle runners for application "{{$labels.flavor}}" @@ -23,6 +25,7 @@ groups: avg_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event",idle="idle",flavor="flavor" | event="runner_start" | unwrap idle[1h]) by(flavor) < 300 labels: severity: high + type: runner-capacity for: 2h annotations: description: Idle time for application "{{$labels.flavor}}" is on average smaller than 5 minutes. diff --git a/src/loki_alert_rules/failure.rules b/src/loki_alert_rules/failure.rules index 6784be3cd..c788b33a0 100644 --- a/src/loki_alert_rules/failure.rules +++ b/src/loki_alert_rules/failure.rules @@ -6,6 +6,7 @@ groups: (sum_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event", crashed_runners="crashed_runners" | event = `reconciliation` | unwrap crashed_runners [1h])) > 0 labels: severity: high + type: runner-failure for: 0s annotations: summary: A runner in unit {{ $labels.juju_unit }} crashed. @@ -14,6 +15,7 @@ groups: sum by (filename) (count_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event" | event="runner_stop" | json status="status" | status="repo-policy-check-failure" | json http_code="status_info.code" | http_code=~"4.." [10m])) / sum by (filename) (count_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event" | event="runner_stop" [10m])) > 0.5 labels: severity: high + type: runner-failure for: 3h annotations: description: More than 50 % of jobs have failed the repo-policy check (4xx status code) @@ -23,6 +25,7 @@ groups: count_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event" | event="runner_stop" | json status="status",repo="repo" | status="repo-policy-check-failure" | json http_code="status_info.code" | http_code=~"5.." [1h]) > 0 labels: severity: high + type: runner-failure for: 0s annotations: description: A repo-policy server error ({{ $labels.http_code }}) was encountered in a runner in unit {{ $labels.juju_unit }} and repository {{ $labels.repo }}.