From 09ca237d4c3cef7c5df80b848130dfb80f06e128 Mon Sep 17 00:00:00 2001 From: I753292 Date: Wed, 8 Jan 2025 23:53:11 +0100 Subject: [PATCH 01/11] Prometheus-Alerting:added template for bedrock alerts in vmware alerting --- .../templates/am-config-route-slack.yaml | 38 ++++++++++++++++++ .../templates/_helper.tpl | 40 ++++++++++++++++++- .../templates/prometheus-alerts.yaml | 24 ++++++++++- 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml b/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml index d4938a1d179..7bc08f33cac 100644 --- a/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml +++ b/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml @@ -450,6 +450,44 @@ spec: matchType: "=" value: compute + - receiver: slack_bedrock_critical + continue: true + matchers: + - name: tier + matchType: "=" + value: vmware + - name: severity + matchType: "=" + value: critical + - name: region + matchType: "=~" + value: {{ without .Values.regions "qa-de-1" | join "|" }} + - name: support_group + matchType: "=" + value: compute + - name: bedrock + matchType: "=" + value: true + + - receiver: slack_bedrock_warning + continue: true + matchers: + - name: tier + matchType: "=" + value: vmware + - name: severity + matchType: "=" + value: warning + - name: region + matchType: "=~" + value: {{ without .Values.regions "qa-de-1" | join "|" }} + - name: support_group + matchType: "=" + value: compute + - name: bedrock + matchType: "=" + value: true + # rework to match support group compute needed - receiver: slack_vmware_info continue: true diff --git a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl index c3740360d74..545c2c21cbb 100644 --- a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl +++ 
b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl @@ -3,4 +3,42 @@ {{- $root := index . 1 -}} {{- $vropshostname := split "." $name -}} vmware-{{ $vropshostname._0 | trimPrefix "vrops-" }} -{{- end -}} \ No newline at end of file +{{- end -}} + +{{- define "bedrockConfirm.expr" -}} +{{- $expr := index . 0 -}} +{{- $mappingKey := index . 1 -}} +expr: > + label_replace( + {{ $expr }} unless on({{ $mappingKey }}) + ( + group by (project, {{ $mappingKey }}) ( + vrops_virtualmachine_system_powered_on{region="eu-de-1", vccluster=~"^productionbb\\d+$"} + ) + and on(project) + group by (project) ( + label_replace( + limes_project_usage{domain=~"iaas-.*"}, + "project", "$$1", "project_id", "(.*)" + ) + ) + ), + "bedrock", "no", "bedrock", "") + or + label_replace( + {{ $expr }} and on({{ $mappingKey }}) + ( + group by (project, {{ $mappingKey }}) ( + vrops_virtualmachine_system_powered_on{region="eu-de-1", vccluster=~"^productionbb\\d+$"} + ) + and on(project) + group by (project) ( + label_replace( + limes_project_usage{domain=~"iaas-.*"}, + "project", "$$1", "project_id", "(.*)" + ) + ) + ), + "bedrock", "yes", "bedrock", "" + ) +{{- end -}} diff --git a/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml b/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml index 26d98bff22d..b7350cf380c 100644 --- a/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml +++ b/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml @@ -1,4 +1,12 @@ {{- $root := . 
}} +{{- $bedrockAlerts := .Values.bedrockAlerts }} +{{- $filteredBedrockAlerts := dict }} + +{{- range $key, $value := $bedrockAlerts }} + {{- if $value }} + {{- $filteredBedrockAlerts = merge $filteredBedrockAlerts (dict $key $value) }} + {{- end }} +{{- end }} {{- range $target := .Values.global.targets }} {{- range $path, $bytes := $.Files.Glob "alerts/*.alerts" }} --- @@ -11,7 +19,19 @@ metadata: prometheus: {{ include "prometheusVMware.name" (list $target $root) }} spec: -{{ printf "%s" $bytes | indent 2 }} - +{{- $string := $bytes | toString }} +{{- $string := (regexReplaceAll "\\n\\s+\\n" $string "\n\n") }} +{{- range $alert := splitList "\n\n" $string }} +{{- $alertname := (regexReplaceAll "[\\s\\S]+?alert: (\\S+)\\n[\\s\\S]+" $alert "${1}") }} +{{- if has $alertname (keys $filteredBedrockAlerts) }} +# Alert expr. templated with bedrock alerts template +{{- $mappingKey := (printf "%s" (get $bedrockAlerts $alertname)) }} +{{ regexReplaceAll "([\\s\\S]+?- alert: \\S+)\\n([\\s\\S]+?expr:[\\s\\S]+)" $alert "${1}" | indent 2 }} +{{ regexReplaceAll "[\\s\\S]+expr.+?(>\\n|\\w|\\|\\n)\\s+([\\s\\S]+?)\\s+\\S+:[\\s\\S]+" $alert (include "bedrockConfirm.expr" (list "$2" $mappingKey)) | indent 6 }} +{{- regexReplaceAll "([\\s\\S]+?expr:[\\s\\S]+?)\\n(\\s+\\S+:[\\s\\S]+)" $alert "${2}" | nindent 2 }} +{{- else }} +{{ printf "%s" $alert | indent 2 }} +{{- end }} {{- end }} {{- end }} +{{- end }} \ No newline at end of file From 9fac8070814e1b77f5c006e040bded75483211bd Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Thu, 16 Jan 2025 17:30:35 +0100 Subject: [PATCH 02/11] Apply suggestions from code review Co-authored-by: Tommy Sauer --- prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl index 545c2c21cbb..743e6de5148 100644 --- 
a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl +++ b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl @@ -23,7 +23,7 @@ expr: > ) ) ), - "bedrock", "no", "bedrock", "") + "bedrock", "false", "bedrock", "") or label_replace( {{ $expr }} and on({{ $mappingKey }}) From b61039eacede81884abf5a38102a9275094c7304 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Thu, 16 Jan 2025 17:30:53 +0100 Subject: [PATCH 03/11] Apply suggestions from code review Co-authored-by: Tommy Sauer --- prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl index 743e6de5148..4f3daa20e70 100644 --- a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl +++ b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl @@ -39,6 +39,6 @@ expr: > ) ) ), - "bedrock", "yes", "bedrock", "" + "bedrock", "true", "bedrock", "" ) {{- end -}} From e7d48fd2ae05f05381ff46a6dca8a5679d64b44e Mon Sep 17 00:00:00 2001 From: I753292 Date: Thu, 16 Jan 2025 17:37:32 +0100 Subject: [PATCH 04/11] Resolved: move the whole section behind `slack_vmware_info` Resolved: Added commented example of data in `prometheus-rules/prometheus-vmware-rules/values.yaml` Resolved: Added comments on `prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl` --- .../templates/am-config-route-slack.yaml | 24 +++++++++---------- .../templates/_helper.tpl | 11 +++++++-- .../templates/prometheus-alerts.yaml | 1 - .../prometheus-vmware-rules/values.yaml | 4 ++++ 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml b/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml index 7bc08f33cac..5bfc0d70b79 100644 --- 
a/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml +++ b/global/prometheus-alertmanager-operated/templates/am-config-route-slack.yaml @@ -450,7 +450,8 @@ spec: matchType: "=" value: compute - - receiver: slack_bedrock_critical + # rework to match support group compute needed + - receiver: slack_vmware_info continue: true matchers: - name: tier @@ -458,18 +459,15 @@ spec: value: vmware - name: severity matchType: "=" - value: critical + value: info - name: region matchType: "=~" value: {{ without .Values.regions "qa-de-1" | join "|" }} - name: support_group matchType: "=" value: compute - - name: bedrock - matchType: "=" - value: true - - - receiver: slack_bedrock_warning + + - receiver: slack_bedrock_critical continue: true matchers: - name: tier @@ -477,7 +475,7 @@ spec: value: vmware - name: severity matchType: "=" - value: warning + value: critical - name: region matchType: "=~" value: {{ without .Values.regions "qa-de-1" | join "|" }} @@ -487,9 +485,8 @@ spec: - name: bedrock matchType: "=" value: true - - # rework to match support group compute needed - - receiver: slack_vmware_info + + - receiver: slack_bedrock_warning continue: true matchers: - name: tier @@ -497,13 +494,16 @@ spec: value: vmware - name: severity matchType: "=" - value: info + value: warning - name: region matchType: "=~" value: {{ without .Values.regions "qa-de-1" | join "|" }} - name: support_group matchType: "=" value: compute + - name: bedrock + matchType: "=" + value: true # Test Channel for CC KVM Alerting slack_alert_kvm_test - receiver: slack_alert_kvm_test diff --git a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl index 545c2c21cbb..3c9679eb1d4 100644 --- a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl +++ b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl @@ -5,6 +5,13 @@ vmware-{{ $vropshostname._0 | trimPrefix "vrops-" }} {{- end -}} +# 
Template around bedrock alerts +# Author: +# Description: +# The original alerting rule is then wrapped by a label_replace function. +# The label_replace adds a new label "bedrock" with the value "true" if the alert is relevant for bedrock. +# The label_replace adds a new label "bedrock" with the value "false" if the alert is not relevant for bedrock. +# The mappingKey is dynamically set within values.yaml for each alertname. {{- define "bedrockConfirm.expr" -}} {{- $expr := index . 0 -}} {{- $mappingKey := index . 1 -}} @@ -23,7 +30,7 @@ expr: > ) ) ), - "bedrock", "no", "bedrock", "") + "bedrock", "false", "bedrock", "") or label_replace( {{ $expr }} and on({{ $mappingKey }}) @@ -39,6 +46,6 @@ expr: > ) ) ), - "bedrock", "yes", "bedrock", "" + "bedrock", "true", "bedrock", "" ) {{- end -}} diff --git a/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml b/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml index b7350cf380c..60b17d2010a 100644 --- a/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml +++ b/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml @@ -1,7 +1,6 @@ {{- $root := . 
}} {{- $bedrockAlerts := .Values.bedrockAlerts }} {{- $filteredBedrockAlerts := dict }} - {{- range $key, $value := $bedrockAlerts }} {{- if $value }} {{- $filteredBedrockAlerts = merge $filteredBedrockAlerts (dict $key $value) }} diff --git a/prometheus-rules/prometheus-vmware-rules/values.yaml b/prometheus-rules/prometheus-vmware-rules/values.yaml index 78ac68accdf..0a2a53a6a14 100644 --- a/prometheus-rules/prometheus-vmware-rules/values.yaml +++ b/prometheus-rules/prometheus-vmware-rules/values.yaml @@ -4,3 +4,7 @@ owner-info: - Christopher Hans - Maximilian Lendrich helm-chart-url: https://github.com/sapcc/helm-charts/tree/master/prometheus-rules/prometheus-vmware-rules + +# Maintained on secrets.git/scaleout/s-qa-de-1/values/vmware-monitoring.yaml +# bedrockAlerts: +# DatastoreDisconnectedWithVmsOnIt: hostsystem \ No newline at end of file From 9038c3786e4c7bb4c00efbb9d51cbcbd6bcd1746 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Thu, 16 Jan 2025 17:45:48 +0100 Subject: [PATCH 05/11] Update values.yaml --- prometheus-rules/prometheus-vmware-rules/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prometheus-rules/prometheus-vmware-rules/values.yaml b/prometheus-rules/prometheus-vmware-rules/values.yaml index 0a2a53a6a14..104a614d43b 100644 --- a/prometheus-rules/prometheus-vmware-rules/values.yaml +++ b/prometheus-rules/prometheus-vmware-rules/values.yaml @@ -7,4 +7,4 @@ owner-info: # Maintained on secrets.git/scaleout/s-qa-de-1/values/vmware-monitoring.yaml # bedrockAlerts: -# DatastoreDisconnectedWithVmsOnIt: hostsystem \ No newline at end of file +# DatastoreDisconnectedWithVmsOnIt: hostsystem From c24a10fe7c79b993080d9a272ba4ab3e081e0b96 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Fri, 17 Jan 2025 11:44:39 +0100 Subject: [PATCH 06/11] Update prometheus-rules/prometheus-vmware-rules/values.yaml Co-authored-by: Richard Tief <56597015+richardtief@users.noreply.github.com> --- 
prometheus-rules/prometheus-vmware-rules/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prometheus-rules/prometheus-vmware-rules/values.yaml b/prometheus-rules/prometheus-vmware-rules/values.yaml index 104a614d43b..5fcf786474c 100644 --- a/prometheus-rules/prometheus-vmware-rules/values.yaml +++ b/prometheus-rules/prometheus-vmware-rules/values.yaml @@ -5,6 +5,6 @@ owner-info: - Maximilian Lendrich helm-chart-url: https://github.com/sapcc/helm-charts/tree/master/prometheus-rules/prometheus-vmware-rules -# Maintained on secrets.git/scaleout/s-qa-de-1/values/vmware-monitoring.yaml +# Maintained in the regional secrets # bedrockAlerts: # DatastoreDisconnectedWithVmsOnIt: hostsystem From 53a4000171a00545c8dd58e50189ac33ac3158e8 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Fri, 17 Jan 2025 11:44:49 +0100 Subject: [PATCH 07/11] Update prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl Co-authored-by: Richard Tief <56597015+richardtief@users.noreply.github.com> --- .../templates/_helper.tpl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl index 3c9679eb1d4..52a25962512 100644 --- a/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl +++ b/prometheus-rules/prometheus-vmware-rules/templates/_helper.tpl @@ -5,13 +5,15 @@ vmware-{{ $vropshostname._0 | trimPrefix "vrops-" }} {{- end -}} -# Template around bedrock alerts -# Author: -# Description: -# The original alerting rule is then wrapped by a label_replace function. -# The label_replace adds a new label "bedrock" with the value "true" if the alert is relevant for bedrock. -# The label_replace adds a new label "bedrock" with the value "false" if the alert is not relevant for bedrock. -# The mappingKey is dynamically set within values.yaml for each alertname. 
+{{- /* +Template around bedrock alerts +Author: +Description: + The original alerting rule is then wrapped by a label_replace function. + The label_replace adds a new label "bedrock" with the value "true" if the alert is relevant for bedrock. + The label_replace adds a new label "bedrock" with the value "false" if the alert is not relevant for bedrock. + The mappingKey is dynamically set within values.yaml for each alertname. +*/}} {{- define "bedrockConfirm.expr" -}} {{- $expr := index . 0 -}} {{- $mappingKey := index . 1 -}} From ac8686c024b63089ede519b542280e0d020fb1d0 Mon Sep 17 00:00:00 2001 From: I753292 Date: Mon, 20 Jan 2025 11:03:08 +0100 Subject: [PATCH 08/11] bumped version of charts.yaml --- global/prometheus-alertmanager-operated/Chart.yaml | 2 +- prometheus-rules/prometheus-vmware-rules/Chart.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/global/prometheus-alertmanager-operated/Chart.yaml b/global/prometheus-alertmanager-operated/Chart.yaml index 33fc571e242..1449d3316c7 100644 --- a/global/prometheus-alertmanager-operated/Chart.yaml +++ b/global/prometheus-alertmanager-operated/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 type: application description: Prometheus Alertmanager via operator. name: prometheus-alertmanager-operated -version: 4.5.2 +version: 4.5.3 dependencies: - alias: prometheus-alertmanager diff --git a/prometheus-rules/prometheus-vmware-rules/Chart.yaml b/prometheus-rules/prometheus-vmware-rules/Chart.yaml index 40bbf94b602..38cf928213c 100644 --- a/prometheus-rules/prometheus-vmware-rules/Chart.yaml +++ b/prometheus-rules/prometheus-vmware-rules/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: A collection of Prometheus alert rules. 
name: prometheus-vmware-rules -version: 1.0.9 +version: 1.0.10 dependencies: - name: owner-info repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm From d6fe82ec7f43acee1c011fae0ab10007a9e5d0c6 Mon Sep 17 00:00:00 2001 From: I753292 Date: Mon, 20 Jan 2025 14:16:57 +0100 Subject: [PATCH 09/11] bumped version of charts.yaml --- global/prometheus-alertmanager-operated/Chart.yaml | 2 +- prometheus-rules/prometheus-vmware-rules/Chart.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/global/prometheus-alertmanager-operated/Chart.yaml b/global/prometheus-alertmanager-operated/Chart.yaml index 1449d3316c7..dc9cb0980ac 100644 --- a/global/prometheus-alertmanager-operated/Chart.yaml +++ b/global/prometheus-alertmanager-operated/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 type: application description: Prometheus Alertmanager via operator. name: prometheus-alertmanager-operated -version: 4.5.3 +version: 4.6.0 dependencies: - alias: prometheus-alertmanager diff --git a/prometheus-rules/prometheus-vmware-rules/Chart.yaml b/prometheus-rules/prometheus-vmware-rules/Chart.yaml index 38cf928213c..1f267e70953 100644 --- a/prometheus-rules/prometheus-vmware-rules/Chart.yaml +++ b/prometheus-rules/prometheus-vmware-rules/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 description: A collection of Prometheus alert rules. 
name: prometheus-vmware-rules -version: 1.0.10 +version: 1.1.0 dependencies: - name: owner-info repository: oci://keppel.eu-de-1.cloud.sap/ccloud-helm From 678a22fe9af0dc9225c98df52bb51e4dafafa945 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Mon, 20 Jan 2025 14:50:44 +0100 Subject: [PATCH 10/11] Update global/prometheus-alertmanager-operated/Chart.yaml Co-authored-by: Tommy Sauer --- global/prometheus-alertmanager-operated/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/global/prometheus-alertmanager-operated/Chart.yaml b/global/prometheus-alertmanager-operated/Chart.yaml index dc9cb0980ac..1449d3316c7 100644 --- a/global/prometheus-alertmanager-operated/Chart.yaml +++ b/global/prometheus-alertmanager-operated/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 type: application description: Prometheus Alertmanager via operator. name: prometheus-alertmanager-operated -version: 4.6.0 +version: 4.5.3 dependencies: - alias: prometheus-alertmanager From db8008b32662af9de8df205b68df93473e11c668 Mon Sep 17 00:00:00 2001 From: christophrichtersap Date: Sun, 2 Feb 2025 21:37:39 +0100 Subject: [PATCH 11/11] Update prometheus-alerts.yaml Added comment for linebreak and whitespace replace formatting --- .../templates/prometheus-alerts.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml b/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml index 60b17d2010a..1e60f71a1ec 100644 --- a/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml +++ b/prometheus-rules/prometheus-vmware-rules/templates/prometheus-alerts.yaml @@ -19,11 +19,15 @@ metadata: spec: {{- $string := $bytes | toString }} +{{- /* +Replaces special whitespace characters to ensure correct YAML formatting. +Previously, improper indentation and line breaks caused alerts to be unmatched. +This fix applies replacements in place. 
+*/}} {{- $string := (regexReplaceAll "\\n\\s+\\n" $string "\n\n") }} {{- range $alert := splitList "\n\n" $string }} {{- $alertname := (regexReplaceAll "[\\s\\S]+?alert: (\\S+)\\n[\\s\\S]+" $alert "${1}") }} {{- if has $alertname (keys $filteredBedrockAlerts) }} -# Alert expr. templated with bedrock alerts template {{- $mappingKey := (printf "%s" (get $bedrockAlerts $alertname)) }} {{ regexReplaceAll "([\\s\\S]+?- alert: \\S+)\\n([\\s\\S]+?expr:[\\s\\S]+)" $alert "${1}" | indent 2 }} {{ regexReplaceAll "[\\s\\S]+expr.+?(>\\n|\\w|\\|\\n)\\s+([\\s\\S]+?)\\s+\\S+:[\\s\\S]+" $alert (include "bedrockConfirm.expr" (list "$2" $mappingKey)) | indent 6 }} @@ -33,4 +37,4 @@ spec: {{- end }} {{- end }} {{- end }} -{{- end }} \ No newline at end of file +{{- end }}