From aa1994f966ff958c917ddfeab4a61e94e405798f Mon Sep 17 00:00:00 2001
From: Vlad Gusev
Date: Mon, 23 Dec 2024 15:58:15 +0200
Subject: [PATCH] [pxc-db] Add kube-state-metrics-based alerts for PXC cluster

Add kube-state-metrics-based alerts:
* GaleraClusterResourceNotReady: the cluster status metric for state
  'ready' is not 1 for 10m
* GaleraClusterBackupNotSucceeded: the backup status metric for state
  'Succeeded' is not 1 for 10m
* GaleraClusterBackupMissing: the newest backup completion time is more
  than 36 hours (129600s) behind time() for 30m

The backup alerts are only rendered when .Values.backup.enabled is set.
---
Review notes (this section is ignored by git am):
* The GaleraClusterBackupMissing expr was missing the closing '}' of its
  label selector; it is closed here, and the description/summary now say
  "within the last 36 hours" to match the expression.
* The blob id on the _backup.alerts.tpl index line was not recomputed
  after that change - regenerate with git format-patch if it matters.
* NOTE(review): indentation was reconstructed from a whitespace-flattened
  copy of this patch - confirm it against the tree before applying.

 common/pxc-db/Chart.yaml                    |  2 +-
 common/pxc-db/templates/alerts.yaml         |  3 ++
 .../templates/alerts/_backup.alerts.tpl     | 29 +++++++++++++++++++
 .../pxc-db/templates/alerts/_pxc.alerts.tpl | 16 ++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 common/pxc-db/templates/alerts/_backup.alerts.tpl

diff --git a/common/pxc-db/Chart.yaml b/common/pxc-db/Chart.yaml
index f8722e23ed8..f3737cfb86f 100644
--- a/common/pxc-db/Chart.yaml
+++ b/common/pxc-db/Chart.yaml
@@ -16,7 +16,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.2.1
+version: 0.2.2
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/common/pxc-db/templates/alerts.yaml b/common/pxc-db/templates/alerts.yaml
index dec13d38943..a9f532d233e 100644
--- a/common/pxc-db/templates/alerts.yaml
+++ b/common/pxc-db/templates/alerts.yaml
@@ -12,4 +12,7 @@ metadata:
 spec:
   groups:
 {{ include (print .Template.BasePath "/alerts/_pxc.alerts.tpl") . | indent 2 }}
+{{- if .Values.backup.enabled }}
+{{ include (print .Template.BasePath "/alerts/_backup.alerts.tpl") . | indent 2 }}
+{{- end }}
 {{- end }}
diff --git a/common/pxc-db/templates/alerts/_backup.alerts.tpl b/common/pxc-db/templates/alerts/_backup.alerts.tpl
new file mode 100644
index 00000000000..a122eabbf03
--- /dev/null
+++ b/common/pxc-db/templates/alerts/_backup.alerts.tpl
@@ -0,0 +1,29 @@
+- name: pxc-backup.alerts
+  rules:
+  - alert: {{ include "pxc-db.alerts.service" . | camelcase }}GaleraClusterBackupNotSucceeded
+    expr: (kube_customresource_perconaxtradbclusterbackup_status{app_kubernetes_io_instance="{{ include "pxc-db.fullname" . }}",state="Succeeded"} != 1)
+    for: 10m
+    labels:
+      context: database
+      service: {{ include "pxc-db.alerts.service" . }}
+      severity: info
+      tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
+      playbook: ''
+      support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
+    annotations:
+      description: "{{ include "pxc-db.fullname" . }} cluster backup is not succeeded."
+      summary: "{{ include "pxc-db.fullname" . }} cluster backup is not succeeded."
+
+  - alert: {{ include "pxc-db.alerts.service" . | camelcase }}GaleraClusterBackupMissing
+    expr: (time() - max by (app_kubernetes_io_instance) (kube_customresource_perconaxtradbclusterbackup_completed{app_kubernetes_io_instance="{{ include "pxc-db.fullname" . }}"}) > 129600)
+    for: 30m
+    labels:
+      context: database
+      service: {{ include "pxc-db.alerts.service" . }}
+      severity: info
+      tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
+      playbook: ''
+      support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
+    annotations:
+      description: "{{ include "pxc-db.fullname" . }} cluster has no backup completed within the last 36 hours."
+      summary: "{{ include "pxc-db.fullname" . }} cluster has no backup completed within the last 36 hours."
diff --git a/common/pxc-db/templates/alerts/_pxc.alerts.tpl b/common/pxc-db/templates/alerts/_pxc.alerts.tpl
index 9cba2602c88..996fb6f542b 100644
--- a/common/pxc-db/templates/alerts/_pxc.alerts.tpl
+++ b/common/pxc-db/templates/alerts/_pxc.alerts.tpl
@@ -8,6 +8,7 @@
       service: {{ include "pxc-db.alerts.service" . }}
       severity: info
       tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
+      playbook: ''
       support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
     annotations:
       description: {{ include "pxc-db.fullname" . }} has too many connections open. Please check the service containers.
@@ -35,6 +36,7 @@
       service: {{ include "pxc-db.alerts.service" . }}
       severity: info
       tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
+      playbook: ''
       support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
     annotations:
       description: {{ include "pxc-db.fullname" . }} has queries waiting for lock more than 20 sec. Deadlock possible.
@@ -137,3 +139,17 @@
     annotations:
       description: "{{ include "pxc-db.fullname" . }} Galera cluster reports at least 1 node with 25% paused replication in the last 30 minutes"
       summary: "{{ include "pxc-db.fullname" . }} Galera cluster node replication paused"
+
+  - alert: {{ include "pxc-db.alerts.service" . | camelcase }}GaleraClusterResourceNotReady
+    expr: (kube_customresource_perconaxtradbcluster_status{app="{{ include "pxc-db.fullname" . }}",state='ready'} != 1)
+    for: 10m
+    labels:
+      context: database
+      service: {{ include "pxc-db.alerts.service" . }}
+      severity: info
+      tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
+      playbook: ''
+      support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
+    annotations:
+      description: "{{ include "pxc-db.fullname" . }} cluster resource is not in ready state."
+      summary: "{{ include "pxc-db.fullname" . }} cluster resource is not in ready state."