From 5ff41f5cd852ecb3d16cff9dc00c5e9496dfcb26 Mon Sep 17 00:00:00 2001 From: Robin Opletal <49439044+fourstepper@users.noreply.github.com> Date: Fri, 27 Dec 2024 22:56:10 +0100 Subject: [PATCH] feat(alerting): magic alerting severities (#127) * feat(alerting): magic alerting hocus pocus Signed-off-by: Hy3n4 * feat(alert-severities): config like this? Signed-off-by: Robin Opletal * fix(alert-severities): remove NoSlo severity Signed-off-by: Robin Opletal * stylistic fixes Signed-off-by: Robin Opletal * fix(config): nicer Cfg initialization Signed-off-by: Robin Opletal * stop hardcoding 1-target in alertmanagerrule Signed-off-by: Robin Opletal * feat(make): make wait for services better when deploying dev infrastructure Signed-off-by: Hy3n4 * feat(alerting): alertmanager config loader edits (#130) * feat(alerting): alertmanager config loader Signed-off-by: Robin Opletal * generated stuff... Signed-off-by: Robin Opletal --------- Signed-off-by: Robin Opletal * chore: add release-drafter gh action (#133) Signed-off-by: Jose Santorum * feat(magic): some features some fixes mainly fixed duration issues also, added some feature to devel, osko dashboards, kustomize for easier deployment and other cool stuff modified function responsible for creating the alerting rule when magicAlerting is enabled, also added some basic mapping for opsgenie, pagerduty and custom alerting tool. 
Currently not working as expected tho Signed-off-by: Hy3n4 * fix(rules): duplicated rules for 5m window Signed-off-by: Hy3n4 * prometheus helper: simplify unique windows, resolve finalizer API warnings Signed-off-by: Robin Opletal * at least move uniqueStrings to function Signed-off-by: Robin Opletal --------- Signed-off-by: Hy3n4 Signed-off-by: Robin Opletal Signed-off-by: Jose Santorum Co-authored-by: Hy3n4 Co-authored-by: Jose Santorum --- Makefile | 2 +- api/osko/v1alpha1/mimirrule_types.go | 15 +- api/osko/v1alpha1/zz_generated.deepcopy.go | 6 + cmd/main.go | 4 +- config/crd/bases/osko.dev_mimirrules.yaml | 10 +- config/samples/kustomization.yaml | 5 +- config/samples/openslo_v1_datasource.yaml | 2 - config/samples/openslo_v1_slo.yaml | 28 +- devel/grafana-agent/configmap.yaml | 11 +- devel/grafana-agent/deployment.yaml | 13 +- devel/grafana/osko-1.json | 255 ++++++++++ devel/grafana/osko-2.json | 464 ++++++++++++++++++ devel/kustomization.yaml | 10 + internal/config/config.go | 11 +- internal/config/types.go | 33 ++ internal/config/utils.go | 29 ++ .../controller/osko/mimirrule_controller.go | 15 +- internal/helpers/mimirtool_helper.go | 12 +- internal/helpers/prometheus_helper.go | 131 ++++- 19 files changed, 981 insertions(+), 75 deletions(-) create mode 100644 devel/grafana/osko-1.json create mode 100644 devel/grafana/osko-2.json create mode 100644 devel/kustomization.yaml diff --git a/Makefile b/Makefile index acf2631..9557ae6 100644 --- a/Makefile +++ b/Makefile @@ -168,7 +168,7 @@ $(ENVTEST): $(LOCALBIN) .PHONY: deploydev deploydev: - @$(KUBECTL) apply -R -f devel/ + @$(KUBECTL) apply -k devel/ @echo "Waiting for services to come online for the port-forwards..." 
@until [ "$$($(KUBECTL) get pods -l app=grafana -o jsonpath='{.items}')}" != "[]" ] && \ [ "$$($(KUBECTL) get pods -l app=grafana -o jsonpath='{.items[0].status.containerStatuses[0].ready}')" == "true" ]; do \ diff --git a/api/osko/v1alpha1/mimirrule_types.go b/api/osko/v1alpha1/mimirrule_types.go index 73fbb28..00af9ef 100644 --- a/api/osko/v1alpha1/mimirrule_types.go +++ b/api/osko/v1alpha1/mimirrule_types.go @@ -1,6 +1,7 @@ package v1alpha1 import ( + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/common/model" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -33,13 +34,13 @@ type RuleGroup struct { } type Rule struct { - Record string `json:"record,omitempty"` - Alert string `json:"alert,omitempty"` - Expr string `json:"expr"` - For model.Duration `json:"for,omitempty"` - KeepFiringFor model.Duration `json:"keep_firing_for,omitempty"` - Labels map[string]string `json:"labels,omitempty"` - Annotations map[string]string `json:"annotations,omitempty"` + Record string `json:"record,omitempty"` + Alert string `json:"alert,omitempty"` + Expr string `json:"expr"` + For *monitoringv1.Duration `json:"for,omitempty"` + KeepFiringFor model.Duration `json:"keep_firing_for,omitempty"` + Labels map[string]string `json:"labels,omitempty"` + Annotations map[string]string `json:"annotations,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/osko/v1alpha1/zz_generated.deepcopy.go b/api/osko/v1alpha1/zz_generated.deepcopy.go index be9c608..7f6354d 100644 --- a/api/osko/v1alpha1/zz_generated.deepcopy.go +++ b/api/osko/v1alpha1/zz_generated.deepcopy.go @@ -5,6 +5,7 @@ package v1alpha1 import ( + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" "github.com/prometheus/common/model" "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -290,6 +291,11 @@ func (in *Multitenancy) DeepCopy() *Multitenancy { // DeepCopyInto is an autogenerated 
deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Rule) DeepCopyInto(out *Rule) { *out = *in + if in.For != nil { + in, out := &in.For, &out.For + *out = new(monitoringv1.Duration) + **out = **in + } if in.Labels != nil { in, out := &in.Labels, &out.Labels *out = make(map[string]string, len(*in)) diff --git a/cmd/main.go b/cmd/main.go index d32affe..82d8a84 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -57,7 +57,7 @@ func main() { opts.BindFlags(flag.CommandLine) flag.Parse() - cfg := config.NewConfig() + config.NewConfig() ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) @@ -141,7 +141,7 @@ func main() { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Recorder: mgr.GetEventRecorderFor("mimirrule-controller"), - RequeueAfterPeriod: cfg.MimirRuleRequeuePeriod, + RequeueAfterPeriod: config.Cfg.MimirRuleRequeuePeriod, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "MimirRule") os.Exit(1) diff --git a/config/crd/bases/osko.dev_mimirrules.yaml b/config/crd/bases/osko.dev_mimirrules.yaml index afc513f..38c32fe 100644 --- a/config/crd/bases/osko.dev_mimirrules.yaml +++ b/config/crd/bases/osko.dev_mimirrules.yaml @@ -85,11 +85,11 @@ spec: type: string for: description: |- - Duration wraps time.Duration. It is used to parse the custom duration format - from YAML. - This type should not propagate beyond the scope of input/output processing. - format: int64 - type: integer + Duration is a valid time duration that can be parsed by Prometheus model.ParseDuration() function. + Supported units: y, w, d, h, m, s, ms + Examples: `30s`, `1m`, `1h20m15s`, `15d` + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string keep_firing_for: description: |- Duration wraps time.Duration. 
It is used to parse the custom duration format diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml index ae65df4..6d9fad7 100644 --- a/config/samples/kustomization.yaml +++ b/config/samples/kustomization.yaml @@ -1,4 +1,7 @@ ## Append samples of your project ## resources: -- osko_v1alpha1_alertmanagerconfig.yaml + - openslo_v1_datasource.yaml + - openslo_v1_slo.yaml + - config_secret.yaml + - osko_v1alpha1_alertmanagerconfig.yaml # +kubebuilder:scaffold:manifestskustomizesamples diff --git a/config/samples/openslo_v1_datasource.yaml b/config/samples/openslo_v1_datasource.yaml index e3688f4..27e6d03 100644 --- a/config/samples/openslo_v1_datasource.yaml +++ b/config/samples/openslo_v1_datasource.yaml @@ -8,10 +8,8 @@ spec: description: Mimir Datasource for logging tenant type: mimir connectionDetails: - address: https://mimir.monitoring.dev.heu.group/ address: http://localhost:9009/ sourceTenants: - gatekeeper-system - targetTenant: gatekeeper-system - monitoring targetTenant: monitoring diff --git a/config/samples/openslo_v1_slo.yaml b/config/samples/openslo_v1_slo.yaml index 07e226c..14c06e4 100644 --- a/config/samples/openslo_v1_slo.yaml +++ b/config/samples/openslo_v1_slo.yaml @@ -1,39 +1,39 @@ apiVersion: openslo.com/v1 kind: SLO metadata: + name: mimir-ingestion-latency labels: - label.osko.dev/team: "infrastructure" - label.osko.dev/system: "gatekeeper" - label.osko.dev/domain: "security" + label.osko.dev/team: "infra" + label.osko.dev/system: "monitoring" + label.osko.dev/domain: "observability" + label.osko.dev/service: "mimir" annotations: osko.dev/datasourceRef: "mimir-infra-ds" osko.dev/magicAlerting: "true" - name: gatekeeper-webhook-response-time spec: budgetingMethod: Occurrences - description: 99% of Gatekeeper webhook requests return in less than 0.5s + description: 95% of all queries should have a latency of less than 300 milliseconds indicator: metadata: - name: gatekeeper-webhook-less-than-05s + name: 
distributor-query-success-latency spec: - description: 99% of Gatekeeper webhook requests return in less than 0.5s + description: 95% of all queries should have a latency of less than 500 milliseconds ratioMetric: good: metricSource: metricSourceRef: mimir-infra-ds type: Mimir spec: - query: controller_runtime_webhook_latency_seconds_bucket{le="0.5", job="gatekeeper-metrics"} + query: cortex_distributor_query_duration_seconds_bucket{le="0.5", method="Distributor.QueryStream", status_code="200"} total: metricSource: metricSourceRef: mimir-infra-ds type: Mimir spec: - query: controller_runtime_webhook_latency_seconds_count{job="gatekeeper-metrics"} + query: cortex_distributor_query_duration_seconds_count{method="Distributor.QueryStream"} objectives: - - displayName: gatekeeper-webhook-less-than-05s - target: '0.99' - service: testing + - target: "0.99" + service: mimir timeWindow: - - duration: 28d - isRolling: true + - duration: 28d + isRolling: true diff --git a/devel/grafana-agent/configmap.yaml b/devel/grafana-agent/configmap.yaml index e61cc67..398da1b 100644 --- a/devel/grafana-agent/configmap.yaml +++ b/devel/grafana-agent/configmap.yaml @@ -12,10 +12,19 @@ data: } } } - prometheus.scrape "static" { + prometheus.relabel "cluster" { + rule { + target_label = "cluster" + replacement = "local" + } forward_to = [ prometheus.remote_write.local.receiver, ] + } + prometheus.scrape "static" { + forward_to = [ + prometheus.relabel.cluster.receiver, + ] targets = [ { "__address__" = "mimir-service:9009", diff --git a/devel/grafana-agent/deployment.yaml b/devel/grafana-agent/deployment.yaml index 6c2d637..122ebf9 100644 --- a/devel/grafana-agent/deployment.yaml +++ b/devel/grafana-agent/deployment.yaml @@ -16,11 +16,11 @@ spec: - name: grafana-agent image: grafana/agent:latest args: - - 'run' - - '/etc/agent/agent.river' - - '--storage.path=/tmp/agent' - - '--server.http.listen-addr=127.0.0.1:80' - - '--server.http.ui-path-prefix=/' + - "run" + - "/etc/agent/agent.river" 
+ - "--storage.path=/tmp/agent" + - "--server.http.listen-addr=127.0.0.1:12345" + - "--server.http.ui-path-prefix=/" volumeMounts: - name: config-volume mountPath: /etc/agent @@ -31,6 +31,9 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + ports: + - containerPort: 12345 + name: http-agent volumes: - name: config-volume diff --git a/devel/grafana/osko-1.json b/devel/grafana/osko-1.json new file mode 100644 index 0000000..dacc094 --- /dev/null +++ b/devel/grafana/osko-1.json @@ -0,0 +1,255 @@ +{ + "description": "Overview of systems", + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Status of systems owned by $team", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "links": [ + { + "url": "http://localhost:3000/d/osko-slos/osko-slos?orgId=1&${system:queryparam}&${team:queryparam}&${window:queryparam}&${env:queryparam}&${domain:queryparam}" + } + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v10.4.0", + "repeat": "system", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "min(\n sum(\n osko_sli_measurement{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", window=~\"$window\"}\n ) by (domain, system)\n > bool\n sum(\n osko_slo_target{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", window=~\"$window\"}\n ) by (domain, system)\n) by (system)\n" + } + ], + "title": "$system", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + 
"id": 3, + "panels": [], + "repeat": "system", + "title": "Lowest SLI of $system", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "decimals": 4, + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 0.98999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 4, + "options": { + "displayMode": "basic", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "v10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "min(\n osko_sli_measurement{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", window=~\"$window\"}\n) by (system)\n" + } + ], + "title": "$system", + "type": "bargauge" + } + ], + "schemaVersion": 36, + "templating": { + "list": [ + { + "description": "Environment", + "label": "Env", + "name": "env", + "query": "prometheus", + "regex": "mimir-.*", + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "28d", + "value": "28d" + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "Window", + "label": "Window", + "name": "window", + "query": "label_values(osko_slo_target, window)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "Team", + "includeAll": true, + "label": "Team", + "multi": true, + "name": "team", + "query": "label_values(osko_slo_target, team)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + 
"$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "Domain", + "includeAll": true, + "label": "Domain", + "multi": true, + "name": "domain", + "query": "label_values(osko_slo_target{team=~\"$team\"}, domain)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "System", + "includeAll": true, + "label": "System", + "multi": true, + "name": "system", + "query": "label_values(osko_slo_target{team=~\"$team\", domain=~\"$domain\"}, system)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "OSKO / Systems overview", + "uid": "osko-systems-overview" +} diff --git a/devel/grafana/osko-2.json b/devel/grafana/osko-2.json new file mode 100644 index 0000000..c72c432 --- /dev/null +++ b/devel/grafana/osko-2.json @@ -0,0 +1,464 @@ +{ + "description": "Look at specific SLOs", + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "repeat": "slo", + "title": "$slo SLO", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "sum(\n osko_sli_measurement{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", 
window=\"$window\"}\n) by (slo_name)\n> bool\nsum(\n osko_slo_target{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n) by (slo_name)\n" + } + ], + "title": "Passing", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "showPoints": "never" + }, + "decimals": 3, + "max": 1, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "pluginVersion": "v10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "min(\n osko_sli_measurement{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n) by (slo_name)\n", + "legendFormat": "Measurement" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "min(\n osko_slo_target{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n) by (slo_name)\n", + "legendFormat": "Target" + } + ], + "title": "SLI vs SLO", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "pluginVersion": "v10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "min(\n osko_error_budget_burn_rate{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n) by (slo_name)\n", + "legendFormat": "Burn rate {{ window }}" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "vector(1)\n", + "legendFormat": "Threshold {{ window }}" + } + ], + "title": "Error Budget burn rate", + 
"type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "thresholds": { + "mode": "absolute" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "sum(osko_sli_total{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"})\n" + } + ], + "title": "Total events", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "sum(\n osko_error_budget_target{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n *\n osko_sli_total{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n)\n", + "legendFormat": "Budget" + } + ], + "title": "Errors budget", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + 
"color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "pluginVersion": "v10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$env" + }, + "expr": "sum(\n (\n osko_error_budget_target{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n - osko_error_budget_available{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n ) *\n osko_sli_total{domain=~\"$domain\", system=~\"$system\", team=~\"$team\", service=~\"$service\", slo_name=~\"$slo\", window=\"$window\"}\n)\n", + "legendFormat": "Remaining" + } + ], + "title": "Errors remaining", + "type": "stat" + } + ], + "schemaVersion": 36, + "tags": [ + "osko" + ], + "templating": { + "list": [ + { + "description": "Environment", + "label": "Env", + "name": "env", + "query": "prometheus", + "regex": "mimir-.*", + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "28d", + "value": "28d" + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "Window", + "label": "Window", + "name": "window", + "query": "label_values(osko_slo_target, window)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "Team", + "includeAll": true, + "label": "Team", + "multi": true, + "name": "team", + "query": "label_values(osko_slo_target, team)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + 
"description": "Domain", + "includeAll": true, + "label": "Domain", + "multi": true, + "name": "domain", + "query": "label_values(osko_slo_target{team=~\"$team\"}, domain)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "System", + "includeAll": true, + "label": "System", + "multi": true, + "name": "system", + "query": "label_values(osko_slo_target{team=~\"$team\", domain=~\"$domain\"}, system)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "Service", + "includeAll": true, + "label": "Service", + "multi": true, + "name": "service", + "query": "label_values(osko_slo_target{team=~\"$team\", domain=~\"$domain\", system=~\"$system\"}, service)", + "refresh": 2, + "type": "query" + }, + { + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${env}" + }, + "description": "SLO", + "includeAll": true, + "label": "SLO", + "multi": true, + "name": "slo", + "query": "label_values(osko_slo_target{team=~\"$team\", domain=~\"$domain\", system=~\"$system\", service=~\"$service\"}, slo_name)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "OSKO / SLOs", + "uid": "osko-slos" +} diff --git a/devel/kustomization.yaml b/devel/kustomization.yaml new file mode 100644 index 0000000..b69aa51 --- /dev/null +++ b/devel/kustomization.yaml @@ -0,0 +1,10 @@ +resources: + - grafana/deployment.yaml + - grafana/service.yaml + - grafana/mimir-datasource.yaml + - mimir/deployment.yaml + - mimir/service.yaml + - mimir/configmap.yaml + - mimir/alertmanager-default-config.yaml + - 
grafana-agent/deployment.yaml + - grafana-agent/configmap.yaml diff --git a/internal/config/config.go b/internal/config/config.go index 65ec485..62dfd85 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -4,8 +4,12 @@ import ( "time" ) -func NewConfig() Config { - config := Config{ +var Cfg Config + +func NewConfig() { + alertingTool := GetEnv("OSKO_ALERTING_TOOL", "opsgenie") + + Cfg = Config{ MimirRuleRequeuePeriod: GetEnvAsDuration("MIMIR_RULE_REQUEUE_PERIOD", 60*time.Second), AlertingBurnRates: AlertingBurnRates{ PageShortWindow: GetEnvAsFloat64("ABR_PAGE_SHORT_WINDOW", 14.4), @@ -14,6 +18,7 @@ func NewConfig() Config { TicketLongWindow: GetEnvAsFloat64("ABR_TICKET_LONG_WINDOW", 1), }, DefaultBaseWindow: GetEnvAsDuration("DEFAULT_BASE_WINDOW", 5*time.Minute), + AlertingTool: alertingTool, + // AlertSeverities: AlertSeveritiesByTool(alertingTool), // I wouldn't default to opsgenie here, maybe better to default to custom and error on startup if no custom variables or valid tool is selected } - return config } diff --git a/internal/config/types.go b/internal/config/types.go index b4a3ea0..76ef508 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -8,6 +8,8 @@ type Config struct { MimirRuleRequeuePeriod time.Duration AlertingBurnRates AlertingBurnRates DefaultBaseWindow time.Duration + AlertingTool string + AlertSeverities AlertSeverities } type AlertingBurnRates struct { @@ -16,3 +18,34 @@ type AlertingBurnRates struct { TicketShortWindow float64 TicketLongWindow float64 } + +type AlertToolConfig struct { + Tool string + Severities map[string]string +} + +type SREAlertSeverity string + +const ( + PageCritical SREAlertSeverity = "page_critical" + PageHigh SREAlertSeverity = "page_high" + TicketHigh SREAlertSeverity = "ticket_high" + TicketMedium SREAlertSeverity = "ticket_medium" +) + +type AlertToolSeverityMap map[SREAlertSeverity]string + +type AlertSeverities struct { + Critical string + HighFast string + HighSlow 
string + Low string + Tool string +} + +func (m AlertToolSeverityMap) GetSeverity(sreSeverity SREAlertSeverity) string { + if sev, ok := m[sreSeverity]; ok { + return sev + } + return m[TicketMedium] // default to lowest severity +} diff --git a/internal/config/utils.go b/internal/config/utils.go index 7af0aaf..70ffb03 100644 --- a/internal/config/utils.go +++ b/internal/config/utils.go @@ -43,3 +43,32 @@ func GetEnvAsDuration(key string, defaultValue time.Duration) time.Duration { } return defaultValue } + +func AlertSeveritiesByTool(tool string) AlertToolSeverityMap { + severityMaps := map[string]AlertToolSeverityMap{ + "opsgenie": { + PageCritical: "P1", + PageHigh: "P2", + TicketHigh: "P3", + TicketMedium: "P4", + }, + "pagerduty": { + PageCritical: "SEV_1", + PageHigh: "SEV_2", + TicketHigh: "SEV_3", + TicketMedium: "SEV_4", + }, + "custom": { + PageCritical: GetEnv("OSKO_ALERTING_SEVERITY_CRITICAL", "critical"), + PageHigh: GetEnv("OSKO_ALERTING_SEVERITY_HIGH", "high"), + TicketHigh: GetEnv("OSKO_ALERTING_SEVERITY_HIGH", "medium"), + TicketMedium: GetEnv("OSKO_ALERTING_SEVERITY_LOW", "low"), + }, + } + + if toolMap, exists := severityMaps[tool]; exists { + return toolMap + } + + return severityMaps["custom"] +} diff --git a/internal/controller/osko/mimirrule_controller.go b/internal/controller/osko/mimirrule_controller.go index 6cec190..c526acc 100644 --- a/internal/controller/osko/mimirrule_controller.go +++ b/internal/controller/osko/mimirrule_controller.go @@ -12,6 +12,7 @@ import ( openslov1 "github.com/oskoperator/osko/api/openslo/v1" "github.com/oskoperator/osko/internal/helpers" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/common/model" "github.com/prometheus/prometheus/model/rulefmt" "gopkg.in/yaml.v3" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -37,8 +38,8 @@ type MimirRuleReconciler struct { const ( mimirRuleNamespace = "osko" - mimirRuleFinalizer = "finalizer.mimir.osko.dev" 
- prometheusRuleFinalizer = "finalizer.prometheusrule.osko.dev" + mimirRuleFinalizer = "finalizer.osko.dev/mimir" + prometheusRuleFinalizer = "finalizer.osko.dev/prometheusrule" errFinalizerAddFailed = "Failed to add the finalizer to the" errFinalizerRemoveFailed = "Failed to remove the finalizer from the" @@ -243,6 +244,14 @@ func (r *MimirRuleReconciler) createMimirRuleGroupAPI(log logr.Logger, rule *osk Labels: r.Labels, } } else { + var modelDuration model.Duration + if r.For != nil { + parsedDuration, err := model.ParseDuration(string(*r.For)) + if err != nil { + return nil + } + modelDuration = parsedDuration + } mimirRuleNode = rulefmt.RuleNode{ Alert: yaml.Node{ Kind: 8, @@ -252,7 +261,7 @@ func (r *MimirRuleReconciler) createMimirRuleGroupAPI(log logr.Logger, rule *osk Kind: 8, Value: r.Expr, }, - For: r.For, + For: modelDuration, Labels: r.Labels, } } diff --git a/internal/helpers/mimirtool_helper.go b/internal/helpers/mimirtool_helper.go index 794d4b6..353237b 100644 --- a/internal/helpers/mimirtool_helper.go +++ b/internal/helpers/mimirtool_helper.go @@ -2,15 +2,15 @@ package helpers import ( "context" + "reflect" + "github.com/go-logr/logr" mimirclient "github.com/grafana/mimir/pkg/mimirtool/client" "github.com/grafana/mimir/pkg/mimirtool/rules/rwrulefmt" openslov1 "github.com/oskoperator/osko/api/openslo/v1" oskov1alpha1 "github.com/oskoperator/osko/api/osko/v1alpha1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "github.com/prometheus/common/model" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "reflect" ) const ( @@ -73,16 +73,10 @@ func NewMimirRuleGroups(rule *monitoringv1.PrometheusRule, connectionDetails *os for _, r := range group.Rules { if r.Record == "" && r.Alert != "" { - - duration, err := model.ParseDuration(string(*r.For)) - if err != nil { - return nil, err - } - mimirRuleNode = oskov1alpha1.Rule{ Alert: r.Alert, Expr: r.Expr.String(), - For: duration, + For: r.For, Labels: r.Labels, } } 
else { diff --git a/internal/helpers/prometheus_helper.go b/internal/helpers/prometheus_helper.go index 600119c..6450f43 100644 --- a/internal/helpers/prometheus_helper.go +++ b/internal/helpers/prometheus_helper.go @@ -12,6 +12,7 @@ import ( openslov1 "github.com/oskoperator/osko/api/openslo/v1" "github.com/oskoperator/osko/internal/config" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "github.com/prometheus/common/model" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" @@ -108,6 +109,19 @@ func mergeLabels(ms ...map[string]string) map[string]string { return labels } +func uniqueStrings(input []string) []string { + seen := make(map[string]bool) + result := []string{} + + for _, v := range input { + if !seen[v] { + seen[v] = true + result = append(result, v) + } + } + return result +} + func (mrs *MonitoringRuleSet) createBaseRuleLabels(window string) map[string]string { return map[string]string{ "namespace": mrs.Slo.Namespace, @@ -234,10 +248,11 @@ func (mrs *MonitoringRuleSet) createRecordingRule(metric, recordName, window str // SetupRules constructs rule groups for monitoring based on SLO and SLI configurations. 
func (mrs *MonitoringRuleSet) SetupRules() ([]monitoringv1.RuleGroup, error) { - //log := ctrllog.FromContext(context.Background()) + log := ctrllog.FromContext(context.Background()) baseWindow := mrs.BaseWindow //Should configurable somewhere as agreed on product workshop - extendedWindow := "28d" //Default to 28d if not specified in the SLO + log.V(1).Info("Starting SetupRules", "baseWindow", baseWindow) + extendedWindow := "28d" //Default to 28d if not specified in the SLO if len(mrs.Slo.Spec.TimeWindow) > 0 && mrs.Slo.Spec.TimeWindow[0].Duration != "" { extendedWindow = string(mrs.Slo.Spec.TimeWindow[0].Duration) @@ -259,6 +274,8 @@ func (mrs *MonitoringRuleSet) SetupRules() ([]monitoringv1.RuleGroup, error) { } windows := []string{baseWindow, extendedWindow, "5m", "30m", "1h", "2h", "6h", "24h", "3d"} + windows = uniqueStrings(windows) + var alertRuleErrorBudgets []monitoringv1.Rule // BASE WINDOW @@ -280,12 +297,19 @@ func (mrs *MonitoringRuleSet) SetupRules() ([]monitoringv1.RuleGroup, error) { rules["errorBudgetValue"][baseWindow] = mrs.createErrorBudgetValueRecordingRule(rules["sliMeasurement"][baseWindow], baseWindow) rules["errorBudgetTarget"][baseWindow] = mrs.createErrorBudgetTargetRecordingRule(baseWindow) rules["burnRate"][baseWindow] = mrs.createBurnRateRecordingRule(rules["errorBudgetValue"][baseWindow], rules["errorBudgetTarget"][baseWindow], baseWindow) + if baseWindow == "5m" || baseWindow == "30m" || baseWindow == "1h" || baseWindow == "6h" || + baseWindow == "24h" || baseWindow == "3d" || baseWindow == "2h" { + alertRuleErrorBudgets = append(alertRuleErrorBudgets, rules["burnRate"][baseWindow]) + } for _, window := range windows { if window == baseWindow { + log.V(1).Info("Skipping base window in loop", "window", window) continue } - // rules["targetRule"][window] = mrs.createRecordingRule(mrs.Slo.Spec.Objectives[0].Target, "slo_target", window, true) + + log.V(1).Info("Processing window", "window", window) + rules["totalRule"][window] = 
mrs.createRecordingRule(rules["totalRule"][baseWindow].Record, "sli_total", window, true) if mrs.Sli.Spec.RatioMetric.Good.MetricSource.Spec.Query != "" { @@ -303,13 +327,33 @@ func (mrs *MonitoringRuleSet) SetupRules() ([]monitoringv1.RuleGroup, error) { rules["errorBudgetValue"][window] = mrs.createErrorBudgetValueRecordingRule(rules["sliMeasurement"][window], window) rules["errorBudgetTarget"][window] = mrs.createErrorBudgetTargetRecordingRule(window) rules["burnRate"][window] = mrs.createBurnRateRecordingRule(rules["errorBudgetValue"][window], rules["errorBudgetTarget"][window], window) + if window == "5m" || window == "30m" || window == "1h" || window == "6h" || window == "24h" || window == "3d" || window == "2h" { + log.V(1).Info("Adding burn rate rule", "window", window) alertRuleErrorBudgets = append(alertRuleErrorBudgets, rules["burnRate"][window]) } } + log.V(1).Info("Final burn rates collection", + "count", len(alertRuleErrorBudgets), + "windows", func() []string { + windows := make([]string, 0) + for _, r := range alertRuleErrorBudgets { + windows = append(windows, r.Labels["window"]) + } + return windows + }()) + rulesByType := make(map[string][]monitoringv1.Rule) + + if rule, exists := rules["targetRule"][baseWindow]; exists { + rulesByType["targetRule"] = []monitoringv1.Rule{rule} + } + for ruleKey, nestedMap := range rules { + if ruleKey == "targetRule" { + continue + } for _, window := range windows { if rule, exists := nestedMap[window]; exists { rulesByType[ruleKey] = append(rulesByType[ruleKey], rule) @@ -327,22 +371,52 @@ func (mrs *MonitoringRuleSet) SetupRules() ([]monitoringv1.RuleGroup, error) { {Name: fmt.Sprintf("%s_burn_rate", sloName), Rules: rulesByType["burnRate"]}, } + // TODO: having the magic alerting we still need to figure out how to pass the severity into the alerting rule when we have more than one alerting tool. 
+ // The idea is to have a map of the alerting tool and the severity and then iterate over all connected AlertNotificationTargets. + log.V(1).Info("Magic alerting", "SLO", sloName, "enabled", mrs.Slo.ObjectMeta.Annotations["osko.dev/magicAlerting"]) if mrs.Slo.ObjectMeta.Annotations["osko.dev/magicAlerting"] == "true" { duration := monitoringv1.Duration("5m") var alertRules []monitoringv1.Rule - alertRules = append(alertRules, mrs.createMagicMultiBurnRateAlert(alertRuleErrorBudgets, "0.001", &duration, "page")) - alertRules = append(alertRules, mrs.createMagicMultiBurnRateAlert(alertRuleErrorBudgets, "0.001", &duration, "ticket")) + + alertRules = append(alertRules, + mrs.createMagicMultiBurnRateAlert( + alertRuleErrorBudgets, + fmt.Sprintf("1-%s", mrs.Slo.Spec.Objectives[0].Target), + &duration, + config.PageCritical, // Fast burn rate, short window + ), + mrs.createMagicMultiBurnRateAlert( + alertRuleErrorBudgets, + fmt.Sprintf("1-%s", mrs.Slo.Spec.Objectives[0].Target), + &duration, + config.PageHigh, // Fast burn rate, long window + ), + mrs.createMagicMultiBurnRateAlert( + alertRuleErrorBudgets, + fmt.Sprintf("1-%s", mrs.Slo.Spec.Objectives[0].Target), + &duration, + config.TicketHigh, // Slow burn rate, short window + ), + mrs.createMagicMultiBurnRateAlert( + alertRuleErrorBudgets, + fmt.Sprintf("1-%s", mrs.Slo.Spec.Objectives[0].Target), + &duration, + config.TicketMedium, // Slow burn rate, long window + ), + ) + + // log.V(1).Info("Alerting rules to be created", "alertRules", alertRules) ruleGroups = append(ruleGroups, monitoringv1.RuleGroup{ - Name: fmt.Sprintf("%s_slo_alert", sloName), Rules: alertRules, + Name: fmt.Sprintf("%s_slo_alert", sloName), + Rules: alertRules, }) } return ruleGroups, nil } // createMagicMultiBurnRateAlert creates a Prometheus alert rule for multi-burn rate alerting. 
-func (mrs *MonitoringRuleSet) createMagicMultiBurnRateAlert(burnRates []monitoringv1.Rule, threshold string, duration *monitoringv1.Duration, severity string) monitoringv1.Rule { +func (mrs *MonitoringRuleSet) createMagicMultiBurnRateAlert(burnRates []monitoringv1.Rule, threshold string, duration *monitoringv1.Duration, sreSeverity config.SREAlertSeverity) monitoringv1.Rule { log := ctrllog.FromContext(context.Background()) - cfg := config.NewConfig() alertingPageWindowsOrder := []string{"1h", "5m", "6h", "30m", "24h", "2h", "3d"} @@ -363,37 +437,51 @@ func (mrs *MonitoringRuleSet) createMagicMultiBurnRateAlert(burnRates []monitori } } + //TODO: Create severity mapping between alerting tool and SRE book severity + // Define the alert expressions for different severities and durations var alertExpression string - if severity == "page" { + switch sreSeverity { + case config.PageCritical, config.PageHigh: alertExpression = fmt.Sprintf( "(%s{%s} > (%.1f * %s) and %s{%s} > (%.1f * %s)) or (%s{%s} > (%.1f * %s) and %s{%s} > (%.1f * %s))", - alertingPageWindows[alertingPageWindowsOrder[2]].Record, mapToColonSeparatedString(burnRates[2].Labels), cfg.AlertingBurnRates.PageShortWindow, threshold, - alertingPageWindows[alertingPageWindowsOrder[0]].Record, mapToColonSeparatedString(burnRates[0].Labels), cfg.AlertingBurnRates.PageShortWindow, threshold, - alertingPageWindows[alertingPageWindowsOrder[3]].Record, mapToColonSeparatedString(burnRates[3].Labels), cfg.AlertingBurnRates.PageLongWindow, threshold, - alertingPageWindows[alertingPageWindowsOrder[1]].Record, mapToColonSeparatedString(burnRates[1].Labels), cfg.AlertingBurnRates.PageLongWindow, threshold, + alertingPageWindows[alertingPageWindowsOrder[2]].Record, mapToColonSeparatedString(burnRates[2].Labels), config.Cfg.AlertingBurnRates.PageShortWindow, threshold, + alertingPageWindows[alertingPageWindowsOrder[0]].Record, mapToColonSeparatedString(burnRates[0].Labels), config.Cfg.AlertingBurnRates.PageShortWindow, 
threshold, + alertingPageWindows[alertingPageWindowsOrder[3]].Record, mapToColonSeparatedString(burnRates[3].Labels), config.Cfg.AlertingBurnRates.PageLongWindow, threshold, + alertingPageWindows[alertingPageWindowsOrder[1]].Record, mapToColonSeparatedString(burnRates[1].Labels), config.Cfg.AlertingBurnRates.PageLongWindow, threshold, ) - } else if severity == "ticket" { + case config.TicketHigh, config.TicketMedium: alertExpression = fmt.Sprintf( "(%s{%s} > (%.1f * %s) and %s{%s} > (%.1f * %s)) or (%s{%s} > %.3f and %s{%s} > %.3f)", - alertingPageWindows[alertingPageWindowsOrder[4]].Record, mapToColonSeparatedString(burnRates[4].Labels), cfg.AlertingBurnRates.TicketShortWindow, threshold, - alertingPageWindows[alertingPageWindowsOrder[5]].Record, mapToColonSeparatedString(burnRates[5].Labels), cfg.AlertingBurnRates.TicketShortWindow, threshold, - alertingPageWindows[alertingPageWindowsOrder[6]].Record, mapToColonSeparatedString(burnRates[6].Labels), cfg.AlertingBurnRates.TicketLongWindow, - alertingPageWindows[alertingPageWindowsOrder[3]].Record, mapToColonSeparatedString(burnRates[3].Labels), cfg.AlertingBurnRates.TicketLongWindow, + alertingPageWindows[alertingPageWindowsOrder[4]].Record, mapToColonSeparatedString(burnRates[4].Labels), config.Cfg.AlertingBurnRates.TicketShortWindow, threshold, + alertingPageWindows[alertingPageWindowsOrder[5]].Record, mapToColonSeparatedString(burnRates[5].Labels), config.Cfg.AlertingBurnRates.TicketShortWindow, threshold, + alertingPageWindows[alertingPageWindowsOrder[6]].Record, mapToColonSeparatedString(burnRates[6].Labels), config.Cfg.AlertingBurnRates.TicketLongWindow, + alertingPageWindows[alertingPageWindowsOrder[3]].Record, mapToColonSeparatedString(burnRates[3].Labels), config.Cfg.AlertingBurnRates.TicketLongWindow, ) } + alertingTool := mrs.Slo.ObjectMeta.Annotations["osko.dev/alertingTool"] + if alertingTool == "" { + alertingTool = config.Cfg.AlertingTool + } + + severities := 
config.AlertSeveritiesByTool(alertingTool) + + toolSeverity := severities.GetSeverity(sreSeverity) + + log.V(1).Info("Alerting rule", "sreSeverity", sreSeverity, "toolSeverity", toolSeverity) + if alertExpression == "" { log.Info("Creation of alerting rule failed", "EmptyExpression", "Failed to create the alert expression, expression is empty") return monitoringv1.Rule{} } return monitoringv1.Rule{ - Alert: fmt.Sprintf("%s_alert_%s", burnRates[0].Record, severity), + Alert: fmt.Sprintf("%s_alert_%s", burnRates[0].Record, sreSeverity), Expr: intstr.FromString(alertExpression), For: duration, Labels: map[string]string{ - "severity": severity, + "severity": toolSeverity, }, Annotations: map[string]string{ "summary": "SLO Burn Rate Alert", @@ -407,8 +495,7 @@ func CreateAlertingRule() (*monitoringv1.PrometheusRule, error) { } func CreatePrometheusRule(slo *openslov1.SLO, sli *openslov1.SLI) (*monitoringv1.PrometheusRule, error) { - cfg := config.NewConfig() - baseWindow := cfg.DefaultBaseWindow.String() + baseWindow := model.Duration(config.Cfg.DefaultBaseWindow).String() if slo.ObjectMeta.Annotations["osko.dev/baseWindow"] != "" { baseWindow = slo.ObjectMeta.Annotations["osko.dev/baseWindow"] }