From 7682c1ab6ce7ad45b6f84b1467e480a572c18336 Mon Sep 17 00:00:00 2001 From: Simon Gerber Date: Tue, 7 Jan 2025 10:23:12 +0100 Subject: [PATCH] Add dynamic `syn_team` label to alerts We use a small Go template to extract the value for `syn_team` from the `project` label of the metric so we can correctly label each individual alert instance. This should allow us to group ArgoCD alerts by `syn_team` (which is already well-defined for alert routing) to ensure that alerts for each AppProject are routed to the responsible team. --- component/monitoring.libsonnet | 23 ++++++++++++++----- .../argocd/01_namespace/20_monitoring.yaml | 6 +++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/component/monitoring.libsonnet b/component/monitoring.libsonnet index 88eb31b9..23307d75 100644 --- a/component/monitoring.libsonnet +++ b/component/monitoring.libsonnet @@ -1,6 +1,8 @@ local kap = import 'lib/kapitan.libjsonnet'; local kube = import 'lib/kube.libjsonnet'; local prometheus = import 'lib/prometheus.libsonnet'; +local syn_teams = import 'syn/syn-teams.libsonnet'; + local inv = kap.inventory(); local params = inv.parameters.argocd; @@ -29,6 +31,12 @@ local serviceMonitor(objname, name) = }; local alert_rules = + local team_label = + if syn_teams.owner != null then + '{{if eq $labels.project "syn"}}{{ "%s" }}{{else}}{{ $labels.project }}{{end}}' % syn_teams.owner + else + null; + kube._Object('monitoring.coreos.com/v1', 'PrometheusRule', 'argocd') { metadata: { name: 'argocd', @@ -47,10 +55,11 @@ local alert_rules = alert: 'ArgoCDAppUnsynced', expr: 'argocd_app_info{exported_namespace="' + params.namespace + '", sync_status!="Synced"} > 0', 'for': '10m', - labels: { + labels: std.prune({ severity: 'warning', syn: 'true', - }, + syn_team: team_label, + }), annotations: { message: 'Argo CD app {{ $labels.name }} is not synced', description: 'kubectl -n ' + params.namespace + ' describe app {{ $labels.name }}', @@ -61,10 +70,11 @@ local alert_rules = alert: 
'ArgoCDAppUnhealthy', expr: 'argocd_app_info{exported_namespace="' + params.namespace + '", health_status!="Healthy"} > 0', 'for': '10m', - labels: { + labels: std.prune({ severity: 'critical', syn: 'true', - }, + syn_team: team_label, + }), annotations: { message: 'Argo CD app {{ $labels.name }} is not healthy', description: 'kubectl -n ' + params.namespace + ' describe app {{ $labels.name }}', @@ -75,10 +85,11 @@ local alert_rules = alert: 'ArgoCDDown', expr: 'up{namespace="' + params.namespace + '", job=~"^syn-argocd-.+$"} != 1', 'for': '5m', - labels: { + labels: std.prune({ severity: 'critical', syn: 'true', - }, + syn_team: team_label, + }), annotations: { message: 'Argo CD job {{ $labels.job }} is down', dashboard: 'argocd', diff --git a/tests/golden/syn-teams/argocd/argocd/01_namespace/20_monitoring.yaml b/tests/golden/syn-teams/argocd/argocd/01_namespace/20_monitoring.yaml index a203bcca..ef3e004f 100644 --- a/tests/golden/syn-teams/argocd/argocd/01_namespace/20_monitoring.yaml +++ b/tests/golden/syn-teams/argocd/argocd/01_namespace/20_monitoring.yaml @@ -75,6 +75,8 @@ spec: labels: severity: warning syn: 'true' + syn_team: '{{if eq $labels.project "syn"}}{{ "sparkling-sound" }}{{else}}{{ + $labels.project }}{{end}}' - alert: ArgoCDAppUnhealthy annotations: dashboard: argocd @@ -86,6 +88,8 @@ spec: labels: severity: critical syn: 'true' + syn_team: '{{if eq $labels.project "syn"}}{{ "sparkling-sound" }}{{else}}{{ + $labels.project }}{{end}}' - alert: ArgoCDDown annotations: dashboard: argocd @@ -95,3 +99,5 @@ spec: labels: severity: critical syn: 'true' + syn_team: '{{if eq $labels.project "syn"}}{{ "sparkling-sound" }}{{else}}{{ + $labels.project }}{{end}}'