Skip to content

Commit

Permalink
feat(base-cluster/monitoring): add pagerduty alertmanager receiver (#653)
Browse files Browse the repository at this point in the history
  • Loading branch information
cwrau authored Jan 3, 2024
1 parent c5ccc48 commit fba2f36
Show file tree
Hide file tree
Showing 15 changed files with 397 additions and 14 deletions.
6 changes: 2 additions & 4 deletions charts/base-cluster/README.md.gotmpl
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ to present the results.
#### Sub-Component [tracing](#monitoring_tracing)

The included [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/)
collects traces via otlp-grpc on every node on the host IP.
collects traces via otlp-grpc on every node via the `open-telemetry-collector-opentelemetry-collector.monitoring` service.
These traces are then sent to [Grafana Tempo](https://grafana.com/oss/tempo/),
which is included as a datasource in Grafana by default.

Expand All @@ -187,9 +187,7 @@ spec:
containers:
- env:
- name: OTEL_HOST <- change this to your framework's environment variable
valueFrom:
fieldRef:
fieldPath: status.hostIP
value: open-telemetry-collector-opentelemetry-collector.monitoring
- name: OTEL_PORT
value: "4317"
```
Expand Down
10 changes: 10 additions & 0 deletions charts/base-cluster/ci/artifacthub-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,13 @@ monitoring:
adminPassword: test
tracing:
enabled: true
deadMansSwitch:
enabled: true
pingKey: PING_KEY
apiKey: API_KEY
prometheus:
alertmanager:
receivers:
pagerduty:
enabled: true
integrationKey: INTEGRATION_KEY
5 changes: 5 additions & 0 deletions charts/base-cluster/ci/deadmansswitch-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CI values: enable the dead man's switch with placeholder credentials.
monitoring:
  deadMansSwitch:
    enabled: true
    pingKey: PING_KEY
    apiKey: API_KEY
7 changes: 7 additions & 0 deletions charts/base-cluster/ci/pagerduty-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# CI values: enable the PagerDuty alertmanager receiver with a placeholder key.
monitoring:
  prometheus:
    alertmanager:
      receivers:
        pagerduty:
          enabled: true
          integrationKey: INTEGRATION_KEY
4 changes: 4 additions & 0 deletions charts/base-cluster/templates/_images.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
{{- include "common.images.image" (dict "imageRoot" .Values.global.kubectl.image "global" .Values.global) -}}
{{- end -}}

{{- /*
base-cluster.curl.image: renders the curl image reference by delegating to
"common.images.image" with `.Values.global.curl.image` as the image root and
`.Values.global` as the global settings.
*/ -}}
{{- define "base-cluster.curl.image" -}}
{{- include "common.images.image" (dict "imageRoot" .Values.global.curl.image "global" .Values.global) -}}
{{- end -}}

{{- /*
base-cluster.flux.image: renders the flux image reference by delegating to
"common.images.image" with `.Values.global.flux.image` as the image root and
`.Values.global` as the global settings.
*/ -}}
{{- define "base-cluster.flux.image" -}}
{{- include "common.images.image" (dict "imageRoot" .Values.global.flux.image "global" .Values.global) -}}
{{- end -}}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{{- if and .Values.monitoring.deadMansSwitch.enabled (eq (include "common.networkPolicy.type" .) "cilium") }}
# Network policy for the dead man's switch pods in the monitoring namespace.
# Only rendered when the dead man's switch is enabled and Cilium policies are in use,
# so a disabled feature does not leave a stray policy behind.
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
  name: dead-mans-switch
  namespace: monitoring
  labels: {{- include "common.labels.standard" $ | nindent 4 }}
    app.kubernetes.io/component: dead-mans-switch
spec:
  endpointSelector:
    matchLabels: {{- include "common.labels.matchLabels" $ | nindent 6 }}
      app.kubernetes.io/component: dead-mans-switch
  ingress:
    - { }
  egress:
    # HTTPS towards the healthchecks.io ping and API hosts.
    - toFQDNs:
        - matchName: hc-ping.com
        - matchName: healthchecks.io
      toPorts:
        - ports:
            - port: "443"
              protocol: TCP
    # DNS lookups via kube-dns, restricted to the two host names above.
    - toServices:
        - k8sServiceSelector:
            selector:
              matchLabels:
                k8s-app: kube-dns
            namespace: kube-system
      toPorts:
        - ports:
            - port: "53"
              protocol: UDP
          rules:
            dns:
              - matchName: hc-ping.com
              - matchName: healthchecks.io
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
{{- if .Values.monitoring.deadMansSwitch.enabled }}
# CronJob that pings hc-ping.com once a minute on the "-scheduling" check of this cluster.
# NOTE(review): the `if false` branch is never rendered; presumably it only exists so
# tooling sees a literal apiVersion - confirm before removing.
{{- if false }}
apiVersion: batch/v1
{{- else }}
apiVersion: {{ include "common.capabilities.cronjob.apiVersion" . }}
{{- end }}
kind: CronJob
metadata:
  name: dead-mans-switch
  namespace: monitoring
  labels: {{- include "common.labels.standard" $ | nindent 4 }}
    app.kubernetes.io/component: dead-mans-switch
spec:
  concurrencyPolicy: Forbid
  startingDeadlineSeconds: 50
  schedule: "* * * * *" # Every minute
  jobTemplate:
    spec:
      template:
        metadata:
          labels: {{- include "common.labels.standard" $ | nindent 12 }}
            app.kubernetes.io/component: dead-mans-switch
        spec:
          securityContext:
            runAsGroup: 1000
            runAsUser: 1000
            runAsNonRoot: true
            fsGroup: 1000
          automountServiceAccountToken: false
          restartPolicy: OnFailure
          containers:
            - name: watchdog
              image: {{ template "base-cluster.curl.image" . }}
              # The pull policy must follow the curl image that this container runs
              # (previously it was keyed on the unrelated kubectl image digest).
              {{- if .Values.global.curl.image.digest }}
              imagePullPolicy: IfNotPresent
              {{- else }}
              imagePullPolicy: Always
              {{- end }}
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop:
                    - ALL
                privileged: false
                readOnlyRootFilesystem: true
              env:
                - name: PING_KEY
                  valueFrom:
                    secretKeyRef:
                      name: dead-mans-switch
                      key: pingKey
              # $(PING_KEY) is emitted literally by the template and expanded by
              # Kubernetes from the container environment at start-up.
              command:
                - curl
                - --silent
                - --show-error
                - --fail
                - --retry
                - '5'
                - --max-time
                - '30'
                - {{ printf "https://hc-ping.com/$(PING_KEY)/k8s-cluster-%s-%s-scheduling" (.Values.global.baseDomain | replace "." "-") .Values.global.clusterName }}
              resources:
                requests:
                  cpu: 100m
                  memory: 16Mi
                limits:
                  cpu: 100m
                  memory: 16Mi
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{{- /*
Renders the manifest from monitoring/deadMansSwitch/secret.yaml a second time as a
Helm hook resource: the rendered Secret is parsed with fromYaml, then merged into a
dict that carries the hook annotations and sets metadata.namespace to the release
namespace, and the result is emitted as YAML.
*/ -}}
{{- if .Values.monitoring.deadMansSwitch.enabled -}}
{{- $secret := include (print .Template.BasePath "/monitoring/deadMansSwitch/secret.yaml") . | fromYaml -}}
{{- $secret = mustMerge (
dict "metadata" (
dict "annotations" (dict
"helm.sh/hook" "pre-install,pre-upgrade,pre-delete"
"helm.sh/hook-delete-policy" "before-hook-creation,hook-succeeded,hook-failed"
)
"namespace" .Release.Namespace
)
)
$secret
-}}
{{- $secret | toYaml -}}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
{{- if .Values.monitoring.deadMansSwitch.enabled }}
# Pre-install/pre-upgrade hook Job that creates the healthchecks.io checks for this
# cluster ("-scheduling", plus "-monitoring" when prometheus is enabled) and sends an
# initial ping to each of them.
apiVersion: batch/v1
kind: Job
metadata:
  name: dead-mans-switch-registration
  namespace: {{ $.Release.Namespace }}
  labels: {{- include "common.labels.standard" $ | nindent 4 }}
    app.kubernetes.io/component: dead-mans-switch
  annotations:
    helm.sh/hook: pre-install,pre-upgrade
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed
spec:
  template:
    spec:
      securityContext:
        runAsGroup: 1000
        runAsUser: 1000
        runAsNonRoot: true
        fsGroup: 1000
      automountServiceAccountToken: false
      restartPolicy: OnFailure
      containers:
        - name: register
          image: {{ include "base-cluster.curl.image" . }}
          # The pull policy must follow the curl image that this container runs
          # (previously it was keyed on the unrelated kubectl image digest).
          {{- if .Values.global.curl.image.digest }}
          imagePullPolicy: IfNotPresent
          {{- else }}
          imagePullPolicy: Always
          {{- end }}
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            privileged: false
            readOnlyRootFilesystem: true
          env:
            - name: PING_KEY
              valueFrom:
                secretKeyRef:
                  name: dead-mans-switch
                  key: pingKey
            - name: API_KEY
              valueFrom:
                secretKeyRef:
                  name: dead-mans-switch
                  key: apiKey
          # Shell tracing (`set -x`) is intentionally not enabled: it would print the
          # X-Api-Key header and the ping key into the pod log.
          command:
            - ash
            - -e
            - -c
            - |
              set -o pipefail
              function createCheck() {
                local checkName="$1"
                local data='{"name": "'"$checkName"'", "slug": "'"$checkName"'", "tags": "k8s {{ .Values.global.clusterName -}}", "timeout": 120, "grace": 60, "channels": "{{- .Values.global.clusterName -}}", "unique": ["name"]}'
                curl --silent --show-error --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" https://healthchecks.io/api/v3/checks/ --data "$data"
                curl --silent --show-error --fail --retry 5 --max-time 30 "https://hc-ping.com/$PING_KEY/$checkName"
              }
              checkName={{- printf "k8s-cluster-%s-%s" (.Values.global.baseDomain | replace "." "-") .Values.global.clusterName | quote }}
              {{- if .Values.monitoring.prometheus.enabled }}
              createCheck "$checkName-monitoring"
              {{- end }}
              createCheck "$checkName-scheduling"
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{{- if .Values.monitoring.deadMansSwitch.enabled -}}
# Holds the healthchecks.io API key and ping key; both values are mandatory
# once the dead man's switch is enabled.
apiVersion: v1
kind: Secret
metadata:
  name: dead-mans-switch
  namespace: monitoring
  labels: {{- include "common.labels.standard" $ | nindent 4 }}
    app.kubernetes.io/component: dead-mans-switch
type: Opaque
stringData:
  apiKey: {{ required "You need to provide the `.Values.monitoring.deadMansSwitch.apiKey`" .Values.monitoring.deadMansSwitch.apiKey | quote }}
  pingKey: {{ required "You need to provide the `.Values.monitoring.deadMansSwitch.pingKey`" .Values.monitoring.deadMansSwitch.pingKey | quote }}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
{{- if lookup "v1" "Secret" "monitoring" "dead-mans-switch" }}
# Pre-delete hook Job that removes this cluster's checks from healthchecks.io.
# Only rendered when the dead-mans-switch Secret currently exists in the
# monitoring namespace.
apiVersion: batch/v1
kind: Job
metadata:
  name: dead-mans-switch-unregister
  namespace: {{ $.Release.Namespace }}
  labels: {{- include "common.labels.standard" $ | nindent 4 }}
    app.kubernetes.io/component: dead-mans-switch
  annotations:
    helm.sh/hook: pre-delete
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed
spec:
  template:
    spec:
      securityContext:
        runAsGroup: 1000
        runAsUser: 1000
        runAsNonRoot: true
        fsGroup: 1000
      automountServiceAccountToken: false
      restartPolicy: OnFailure
      containers:
        - name: unregister
          image: {{ include "base-cluster.curl.image" . }}
          # The pull policy must follow the curl image that this container runs
          # (previously it was keyed on the unrelated kubectl image digest).
          {{- if .Values.global.curl.image.digest }}
          imagePullPolicy: IfNotPresent
          {{- else }}
          imagePullPolicy: Always
          {{- end }}
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            privileged: false
            readOnlyRootFilesystem: true
          env:
            - name: API_KEY
              valueFrom:
                secretKeyRef:
                  name: dead-mans-switch
                  key: apiKey
          # Shell tracing (`set -x`) is intentionally not enabled: it would print the
          # X-Api-Key header into the pod log.
          command:
            - ash
            - -e
            - -c
            - |
              set -o pipefail
              function deleteCheck() {
                local checkName="$1"
                local existingCheckUUID
                # The lookup must authenticate with $API_KEY (the only key in the
                # environment); `sed -n ... p` prints the UUID only when a check matched.
                existingCheckUUID="$(curl --silent --show-error --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" "https://healthchecks.io/api/v3/checks/?slug=$checkName" | sed -n -r 's#^.*ping_url":\s*"[^"]+/([^"]+)".*$#\1#p')"
                # Nothing registered under this slug: nothing to delete.
                if [ -z "$existingCheckUUID" ]; then
                  return 0
                fi
                if curl --silent --show-error --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" "https://healthchecks.io/api/v3/checks/$existingCheckUUID" > /dev/null; then
                  curl --silent --show-error --fail --retry 5 --max-time 30 --header "X-Api-Key: $API_KEY" "https://healthchecks.io/api/v3/checks/$existingCheckUUID" --request DELETE
                fi
              }
              checkName={{- printf "k8s-cluster-%s-%s" (.Values.global.baseDomain | replace "." "-") .Values.global.clusterName | quote }}
              {{- if .Values.monitoring.prometheus.enabled }}
              deleteCheck "$checkName-monitoring"
              {{- end }}
              deleteCheck "$checkName-scheduling"
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
{{- define "base-cluster.prometheus-stack.alertmanager.config" -}}
enabled: false # TODO dependent on routes/receivers
# TODO routes
# TODO receivers
{{- if false }}
{{- $enabled := false -}}
{{- range $_, $receiver := .Values.monitoring.prometheus.alertmanager.receivers -}}
{{- $enabled = or $enabled $receiver.enabled -}}
{{- end -}}
enabled: {{ $enabled }}
{{- if $enabled }}
podDisruptionBudget:
enabled: true
{{- if include "base-cluster.monitoring.unauthenticated-ingress.enabled" (dict "name" "alertmanager" "context" .) }}
Expand All @@ -26,5 +28,41 @@ alertmanagerSpec:
storage: {{ .Values.monitoring.prometheus.alertmanager.persistence.size }}
alertmanagerConfigSelector:
matchLabels: {{- .Values.monitoring.labels | toYaml | nindent 6 }}
config:
  {{- /* One condition for both the healthchecks.io receiver and the route that targets it, so the route can never reference a receiver that was not rendered. */}}
  {{- $healthchecksEnabled := and .Values.monitoring.deadMansSwitch.enabled .Values.global.baseDomain .Values.global.clusterName }}
  {{- if .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.enabled }}
  global:
    pagerduty_url: {{ .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.url | required "You need to provide the `.Values.monitoring.prometheus.alertmanager.receivers.pagerduty.url`" | quote }}
  {{- end }}
  receivers:
    {{- if .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.enabled }}
    - name: pagerduty
      pagerduty_configs:
        # Quoted so that a key consisting only of digits stays a string.
        - routing_key: {{ .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.integrationKey | required "You need to provide the `.Values.monitoring.prometheus.alertmanager.receivers.pagerduty.integrationKey`" | quote }}
          send_resolved: true
          http_config:
            follow_redirects: true
    {{- end }}
    {{- if $healthchecksEnabled }}
    - name: healthchecks.io
      webhook_configs:
        - url: {{ printf "https://hc-ping.com/%s/k8s-cluster-%s-%s-monitoring" .Values.monitoring.deadMansSwitch.pingKey (.Values.global.baseDomain | replace "." "-") .Values.global.clusterName | quote }}
          send_resolved: false
    {{- end }}
    - name: "null"
  route:
    # The root route always names a default receiver; without PagerDuty, unmatched
    # alerts go to the "null" receiver.
    {{- if .Values.monitoring.prometheus.alertmanager.receivers.pagerduty.enabled }}
    receiver: pagerduty
    {{- else }}
    receiver: "null"
    {{- end }}
    routes:
      {{- if $healthchecksEnabled }}
      - match:
          alertname: Watchdog
        receiver: healthchecks.io
        group_interval: 1m
        repeat_interval: 1m
      {{- end }}
      - match:
          alertname: InfoInhibitor
        receiver: "null"
{{- end }}
{{- end -}}
Loading

0 comments on commit fba2f36

Please sign in to comment.