Skip to content

Commit

Permalink
[pxc-db] Add kube-state-metrics-based alerts for PXC cluster
Browse files Browse the repository at this point in the history
Add kube-state-metrics-based alerts:

* GaleraClusterResourceNotReady
* GaleraClusterBackupNotSucceeded
* GaleraClusterBackupMissing
  • Loading branch information
s10 committed Dec 23, 2024
1 parent d3644bb commit aa1994f
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 1 deletion.
2 changes: 1 addition & 1 deletion common/pxc-db/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1
version: 0.2.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
3 changes: 3 additions & 0 deletions common/pxc-db/templates/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,7 @@ metadata:
spec:
groups:
{{ include (print .Template.BasePath "/alerts/_pxc.alerts.tpl") . | indent 2 }}
{{- if .Values.backup.enabled }}
{{ include (print .Template.BasePath "/alerts/_backup.alerts.tpl") . | indent 2 }}
{{- end }}
{{- end }}
29 changes: 29 additions & 0 deletions common/pxc-db/templates/alerts/_backup.alerts.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Backup alert rules for the PXC cluster of this release.
# Both rules are evaluated on kube-state-metrics custom-resource metrics and
# select series by the release's full name.
- name: pxc-backup.alerts
  rules:
  # Fires when the "Succeeded" state series of a backup resource has a value
  # other than 1 for 10 minutes.
  - alert: {{ include "pxc-db.alerts.service" . | camelcase }}GaleraClusterBackupNotSucceeded
    expr: (kube_customresource_perconaxtradbclusterbackup_status{app_kubernetes_io_instance="{{ include "pxc-db.fullname" . }}",state="Succeeded"} != 1)
    for: 10m
    labels:
      context: database
      service: {{ include "pxc-db.alerts.service" . }}
      severity: info
      tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
      playbook: ''
      support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
    annotations:
      description: "{{ include "pxc-db.fullname" . }} cluster backup is not succeeded."
      summary: "{{ include "pxc-db.fullname" . }} cluster backup is not succeeded."

  # Fires when the most recent backup completion is more than 129600 s (36 h)
  # in the past, sustained for 30 minutes.
  # The label selector must be closed with a closing brace before the closing
  # parenthesis of max(); without it the expression is not valid PromQL.
  # NOTE(review): presumably the *_completed metric carries a Unix timestamp in
  # seconds, since it is subtracted from time() - confirm against the
  # kube-state-metrics custom-resource configuration.
  # NOTE(review): if no series matches the selector the expression returns
  # nothing, so this rule cannot fire for a cluster that never had a backup.
  - alert: {{ include "pxc-db.alerts.service" . | camelcase }}GaleraClusterBackupMissing
    expr: (time() - max by (app_kubernetes_io_instance) (kube_customresource_perconaxtradbclusterbackup_completed{app_kubernetes_io_instance="{{ include "pxc-db.fullname" . }}"}) > 129600)
    for: 30m
    labels:
      context: database
      service: {{ include "pxc-db.alerts.service" . }}
      severity: info
      tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
      playbook: ''
      support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
    annotations:
      description: "{{ include "pxc-db.fullname" . }} cluster has no backup completed within the last 36 hours."
      summary: "{{ include "pxc-db.fullname" . }} cluster has no backup completed within the last 36 hours."
16 changes: 16 additions & 0 deletions common/pxc-db/templates/alerts/_pxc.alerts.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
service: {{ include "pxc-db.alerts.service" . }}
severity: info
tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
playbook: ''
support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
annotations:
description: {{ include "pxc-db.fullname" . }} has too many connections open. Please check the service containers.
Expand Down Expand Up @@ -35,6 +36,7 @@
service: {{ include "pxc-db.alerts.service" . }}
severity: info
tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
playbook: ''
support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}
annotations:
description: {{ include "pxc-db.fullname" . }} has queries waiting for lock more than 20 sec. Deadlock possible.
Expand Down Expand Up @@ -137,3 +139,17 @@
annotations:
description: "{{ include "pxc-db.fullname" . }} Galera cluster reports at least 1 node with 25% paused replication in the last 30 minutes"
summary: "{{ include "pxc-db.fullname" . }} Galera cluster node replication paused"

- alert: {{ include "pxc-db.alerts.service" . | camelcase }}GaleraClusterResourceNotReady
  # Fires when the "ready" state series of the cluster custom resource has a
  # value other than 1 for 10 minutes.
  # NOTE(review): this selector filters on the "app" label while the backup
  # metrics are selected by "app_kubernetes_io_instance" - confirm that "app"
  # is the label this metric actually exposes.
  expr: (kube_customresource_perconaxtradbcluster_status{app="{{ include "pxc-db.fullname" . }}",state='ready'} != 1)
  for: 10m
  annotations:
    summary: "{{ include "pxc-db.fullname" . }} cluster resource is not in ready state."
    description: "{{ include "pxc-db.fullname" . }} cluster resource is not in ready state."
  labels:
    # Static routing labels.
    context: database
    severity: info
    playbook: ''
    # Labels derived from the release and its values; rendering fails if
    # tier or support_group are not set.
    service: {{ include "pxc-db.alerts.service" . }}
    tier: {{ required ".Values.alerts.tier missing" .Values.alerts.tier }}
    support_group: {{ required ".Values.alerts.support_group missing" .Values.alerts.support_group }}

0 comments on commit aa1994f

Please sign in to comment.