From 297eb400cf1c1797f439a093891c84dd30b739e8 Mon Sep 17 00:00:00 2001
From: Kyle Brennan
Date: Tue, 1 Oct 2024 18:24:51 +0000
Subject: [PATCH 1/2] [ops] Introduce GitpodWsManagerMk2BackupFailureError and
 GitpodWsManagerMk2BackupFailureCritical

Add two alerts on the per-cluster one-hour increase of
gitpod_ws_manager_mk2_workspace_backups_failure_total (clusters whose
name matches "ephemeral.*" are excluded):

  * GitpodWsManagerMk2BackupFailureError (severity: error) fires when
    the increase is greater than 0 and at most 16.
  * GitpodWsManagerMk2BackupFailureCritical (severity: critical) fires
    when the increase is greater than 16.

The lower bound on the Error alert is required: a PromQL comparison
without `bool` is a filter, so a bare `<= 16` also matches clusters
with zero failures and the alert would fire permanently.
---
Review note: the blob ids on the `index` lines of this series predate
the change to the Error expression and were not recomputed.

 .../workspace/rules/satellite/workspaces.yaml | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml b/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml
index b33423403e67e5..89287a160f7454 100644
--- a/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml
+++ b/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml
@@ -45,3 +45,25 @@ spec:
         sum by(cluster) (avg_over_time(gitpod_workspace_regular_not_active_percentage_mk2[1m]) > 0)
         AND
         sum by(cluster) (rate(gitpod_ws_manager_mk2_workspace_startup_seconds_sum{type="Regular"}[1m])) == 0
+    - alert: GitpodWsManagerMk2BackupFailureError
+      labels:
+        severity: error
+        team: engine
+      for: 1h
+      annotations:
+        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WorkspaceBackupFailures.md
+        summary: Workspace backups failed recently in cluster {{ $labels.cluster }}
+        description: This can happen when a single node has failed in the cloud provider
+      expr: |
+        (sum by (cluster) (increase(gitpod_ws_manager_mk2_workspace_backups_failure_total{cluster!~"ephemeral.*"}[1h])) > 0) <= 16
+    - alert: GitpodWsManagerMk2BackupFailureCritical
+      labels:
+        severity: critical
+        team: engine
+      for: 1h
+      annotations:
+        runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WorkspaceBackupFailures.md
+        summary: Workspace backups failed recently in cluster {{ $labels.cluster }}
+        description: This can be an indicator of two or more nodes failing in a cloud provider
+      expr: |
+        sum by (cluster) (increase(gitpod_ws_manager_mk2_workspace_backups_failure_total{cluster!~"ephemeral.*"}[1h])) > 16

From ed79aa9bea77cc875b5d60016863b772dd611d55 Mon Sep 17 00:00:00 2001
From: Kyle Brennan
Date: Tue, 1 Oct 2024 19:54:08 +0000
Subject: [PATCH 2/2] Fix backup failure alerts: drop `for: 1h`

Remove the `for: 1h` clause from GitpodWsManagerMk2BackupFailureError
and GitpodWsManagerMk2BackupFailureCritical. Both expressions already
look back over one hour via increase(...[1h]); presumably the extra
hold period only delayed notification (the original commit did not
state its rationale).
---
 .../mixins/workspace/rules/satellite/workspaces.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml b/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml
index 89287a160f7454..3fa3eadce358e6 100644
--- a/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml
+++ b/operations/observability/mixins/workspace/rules/satellite/workspaces.yaml
@@ -49,7 +49,6 @@ spec:
       labels:
         severity: error
         team: engine
-      for: 1h
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WorkspaceBackupFailures.md
         summary: Workspace backups failed recently in cluster {{ $labels.cluster }}
@@ -60,7 +59,6 @@ spec:
       labels:
         severity: critical
         team: engine
-      for: 1h
       annotations:
         runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/WorkspaceBackupFailures.md
         summary: Workspace backups failed recently in cluster {{ $labels.cluster }}