
feat: Adding new datadog monitors chart #347

Merged
11 commits merged on Nov 6, 2024
13 changes: 13 additions & 0 deletions charts/datadog-monitors/Chart.yaml
@@ -0,0 +1,13 @@
apiVersion: v2
name: datadog-monitors
description: datadog monitor alerts template
type: application
version: 0.0.1
appVersion: latest
maintainers:
  - name: scohen
    email: [email protected]
dependencies:
  - name: nd-common
    version: 0.3.6
    repository: file://../nd-common
54 changes: 54 additions & 0 deletions charts/datadog-monitors/README.md
@@ -0,0 +1,54 @@
# datadog-monitors

![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: latest](https://img.shields.io/badge/AppVersion-latest-informational?style=flat-square)

datadog monitor alerts template

## Maintainers

| Name | Email | Url |
| ---- | ------ | --- |
| scohen | <[email protected]> | |

## Requirements

| Repository | Name | Version |
|------------|------|---------|
| file://../nd-common | nd-common | 0.3.6 |

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}}` | List of monitors |
| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: unique monitor resource name (needed to allow value overrides; also used as the Datadog monitor resource name) |
| monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor; defaults to false |
| monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."` | Required: monitor message |
| monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Required: monitor name |
| monitors.resourceName.options | `map[string]interface{}` | `{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}}` | Optional: monitor options |
| monitors.resourceName.options.evaluationDelay | `int` | `300` | Optional: Time in seconds to wait before evaluating the monitor |
| monitors.resourceName.options.groupbySimpleMonitor | `boolean` | `false` | Optional: A Boolean indicating whether to group by simple monitor; triggers a single alert or multiple alerts when any group breaches the threshold. |
| monitors.resourceName.options.includeTags | `boolean` | `false` | Optional: A Boolean indicating whether notifications from this monitor automatically insert its triggering tags into the title. |
| monitors.resourceName.options.newGroupDelay | `int` | `300` | Optional: Time in seconds to allow a host to boot and applications to fully start before starting the evaluation. |
| monitors.resourceName.options.noDataTimeframe | `int` | `30` | Optional: The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. |
| monitors.resourceName.options.notifyBy | `string[]` | `[]` | Optional: List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. |
| monitors.resourceName.options.notifyNoData | `boolean` | `false` | Optional: A Boolean indicating whether this monitor notifies when data stops reporting. |
| monitors.resourceName.options.renotifyInterval | `int` | `0` | Optional: The number of minutes after the last notification before a monitor re-notifies on the current status. |
| monitors.resourceName.options.renotifyOccurrences | `int` | `0` | Optional: The number of times re-notification messages should be sent on the current status at the provided re-notification interval. |
| monitors.resourceName.options.renotifyStatus | `string[]` | `[]` | Optional: The types of statuses for which re-notification messages should be sent (valid values are alert, warn, no data). |
| monitors.resourceName.options.requireFullWindow | `boolean` | `false` | Optional: A Boolean indicating whether this monitor requires a full window of data before it fires. We highly recommend setting this to false for sparse metrics; otherwise some evaluations are skipped. |
| monitors.resourceName.options.thresholdWindows | `map[string]string` | `{"alertWindow":"5m","recoveryWindow":"10m"}` | Optional: Threshold windows to fine-tune alerting |
| monitors.resourceName.options.thresholdWindows.alertWindow | `string` | `"5m"` | Optional: Describes how long an anomalous metric must be anomalous before the alert fires. |
| monitors.resourceName.options.thresholdWindows.recoveryWindow | `string` | `"10m"` | Optional: Describes how long an anomalous metric must be normal before the alert recovers. |
| monitors.resourceName.options.thresholds | `map[string]string` | `{"critical":"1","warning":"0.28"}` | Optional: monitor thresholds |
| monitors.resourceName.options.thresholds.critical | `string` | `"1"` | Optional: monitor critical threshold |
| monitors.resourceName.options.thresholds.warning | `string` | `"0.28"` | Optional: monitor warning threshold |
| monitors.resourceName.priority | `string` | `"2"` | Optional: monitor priority |
| monitors.resourceName.query | `string` | `"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"` | Required: monitor query |
| monitors.resourceName.tags | `string[]` | `["tagname:tagvalue"]` | Optional: Additional monitor tags (added on top of the default tags: service, team, namespace, cluster, app_env, app_group) |
| monitors.resourceName.type | `string` | `"query alert"` | Optional: monitor type, if not specified will default to 'query alert' Datadog monitor types to type values mapping: - anomaly: `query alert` - APM: `query alert` or `trace-analytics alert` - composite: `composite` - custom: `service check` - forecast: `query alert` - host: `service check` - integration: `query alert` or `service check` - live process: `process alert` - logs: `log alert` - metric: `query alert` - network: `service check` - outlier: `query alert` - process: `service check` - rum: `rum alert` - SLO: `slo alert` - watchdog: `event-v2 alert` - event-v2: `event-v2 alert` - audit: `audit alert` - error-tracking: `error-tracking alert` - database-monitoring: `database-monitoring alert` - network-performance: `network-performance alert` - service-discovery: `service-discovery alert` |
| serviceName | `string` | `nil` | Optional shared PagerDuty service name for monitors; it is added as a tag on alerts. If not provided, `.Release.Name` is used by default. |
| team | `string` | `nil` | Optional shared PagerDuty team name for monitors; it is added as a tag on alerts. If not provided, the tag is not added. |

----------------------------------------------
Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0)
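For reference (illustrative only, not part of the generated README), a consumer of this chart only needs the required keys from the table above for each entry under `monitors`: a unique resource name plus `name`, `message`, and `query`. Everything else falls back to the chart defaults; for example, `type` becomes `query alert` and the shared `service`, `team`, and `namespace` tags are added automatically. The monitor below is hypothetical:

```yaml
monitors:
  high-error-rate:                     # hypothetical unique monitor resource name
    name: "High HTTP error rate"
    message: "Error rate is above the threshold. Check recent deploys."
    query: "sum(last_5m):sum:trace.http.request.errors{env:prod}.as_count() > 100"
```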
1 change: 1 addition & 0 deletions charts/datadog-monitors/ci/local-values.yaml
7 changes: 7 additions & 0 deletions charts/datadog-monitors/templates/_helpers.tpl
@@ -0,0 +1,7 @@
{{- define "datadog-monitors.shared-tags" -}}
- "service:{{ default $.Release.Name .Values.serviceName }}"
- "namespace:{{ .Release.Namespace }}"
{{- with .Values.team }}
- "team:{{ . }}"
{{- end }}
{{- end }}
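The `datadog-monitors.shared-tags` helper renders the tags attached to every monitor: a `service` tag taken from `.Values.serviceName` (falling back to the release name), the release namespace, and a `team` tag only when `.Values.team` is set. A sketch of its output, assuming hypothetical values `serviceName: checkout-api` and `team: payments` for a release installed into the `monitoring` namespace:

```yaml
- "service:checkout-api"
- "namespace:monitoring"
- "team:payments"
```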
25 changes: 25 additions & 0 deletions charts/datadog-monitors/templates/datadog-monitors.yaml
@@ -0,0 +1,25 @@
{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}}
{{- range $name, $monitor := .Values.monitors }}
{{- if not $monitor.disabled }}
---
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: {{ $name }}
  labels: {{- include "nd-common.labels" $ | nindent 4 }}
spec:
  name: {{ $monitor.name | quote }}
  message: {{ $monitor.message | quote }}
  query: {{ $monitor.query | quote }}
  type: {{ default "query alert" $monitor.type | quote }}
  {{- with $monitor.priority }}
  priority: {{ . }}
  {{- end }}
  tags: {{ include "datadog-monitors.shared-tags" $ | nindent 4 }}
  {{- with $monitor.tags }}{{ toYaml . | nindent 4 }}{{- end }}
  options:
    locked: false
    {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }}
{{- end }}
{{- end }}
{{- end }}
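The whole template is gated on `.Capabilities.APIVersions.Has "datadoghq.com/v1alpha1"`, so nothing renders on a cluster without the DatadogMonitor CRD; when rendering offline, `helm template` only reports that API version if you pass `--api-versions datadoghq.com/v1alpha1`. Each entry in `.Values.monitors` that is not `disabled` becomes one `DatadogMonitor` resource named after its map key. A sketch of a single rendered resource, assuming a hypothetical enabled `failed-pods` monitor using the example chart values plus `serviceName: eks` and `team: cloudeng` (the labels block is a placeholder, since `nd-common.labels` is not part of this diff):

```yaml
---
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: failed-pods
  labels:
    # placeholder: real labels come from the nd-common.labels helper
    helm.sh/chart: datadog-monitors-0.0.1
spec:
  name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"
  message: "More than ten pods are failing ..."  # abridged
  query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"
  type: "query alert"
  priority: 2
  tags:
    - "service:eks"
    - "namespace:monitoring"
    - "team:cloudeng"
  options:
    locked: false
```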
26 changes: 26 additions & 0 deletions charts/datadog-monitors/values.local.yaml
@@ -0,0 +1,26 @@
serviceName: eks
team: cloudeng
monitors:
  failed-pods:
    disabled: true
    name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"
    message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."
    priority: "2"
    query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"
    type: "query alert"
  datadog-log-alert-test:
    query: "logs(\"source:nagios AND status:error\").index(\"default\").rollup(\"count\").last(\"1h\") > 5"
    type: "log alert"
    name: "Test log alert made from DatadogMonitor"
    message: "1-2-3 testing"
    tags:
      - "test:datadog"
    priority: 5
    options:
      enableLogsSample: true
      evaluationDelay: 300
      includeTags: true
      locked: false
      notifyNoData: true
      noDataTimeframe: 30
      renotifyInterval: 1440
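Because each monitor is keyed by its resource name, a single monitor can be tuned without repeating its whole definition: Helm merges value maps recursively, so a hypothetical extra values file like the one below only changes the listed keys of `datadog-log-alert-test` and keeps the rest of the entry from the file above intact:

```yaml
# Hypothetical override layered on top of the values above,
# e.g. helm template . -f values.local.yaml -f override.yaml
monitors:
  datadog-log-alert-test:
    priority: 3
    options:
      renotifyInterval: 60
```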
83 changes: 83 additions & 0 deletions charts/datadog-monitors/values.yaml
@@ -0,0 +1,83 @@
# -- (`string`) Optional shared PagerDuty service name for monitors; it is added as a tag on alerts. If not provided, `.Release.Name` is used by default.
serviceName:
# -- (`string`) Optional shared PagerDuty team name for monitors; it is added as a tag on alerts. If not provided, the tag is not added.
team:

# -- (`map[string]interface{}`) List of monitors
monitors:
  # -- (`map[string]interface{}`) Required: unique monitor resource name (needed to allow value overrides; also used as the Datadog monitor resource name)
  resourceName:
Comment on lines +8 to +9

Member: I would not create a real entry here and instead just have a large comment block. (Although I see you have disabled the entry below, so I don't have strong opinions on this.)

Contributor (author): That was my original intention, but I couldn't figure out how to format it properly so that helm-docs recognizes the fields.

    # -- (`boolean`) Optional: whether to exclude the monitor; defaults to false
    disabled: true
    # -- (`string`) Required: monitor name
    name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"
    # -- (`string`) Required: monitor message
    message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."
    # -- (`string`) Optional: monitor priority
    priority: "2"
    # -- (`string`) Required: monitor query
    query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"
    # -- (`string`) Optional: monitor type, if not specified will default to 'query alert'
    # Datadog monitor types to type values mapping:
    # - anomaly: `query alert`
    # - APM: `query alert` or `trace-analytics alert`
    # - composite: `composite`
    # - custom: `service check`
    # - forecast: `query alert`
    # - host: `service check`
    # - integration: `query alert` or `service check`
    # - live process: `process alert`
    # - logs: `log alert`
    # - metric: `query alert`
    # - network: `service check`
    # - outlier: `query alert`
    # - process: `service check`
    # - rum: `rum alert`
    # - SLO: `slo alert`
    # - watchdog: `event-v2 alert`
    # - event-v2: `event-v2 alert`
    # - audit: `audit alert`
    # - error-tracking: `error-tracking alert`
    # - database-monitoring: `database-monitoring alert`
    # - network-performance: `network-performance alert`
    # - service-discovery: `service-discovery alert`
    type: "query alert"
    # -- (`string[]`) Optional: Additional monitor tags (added on top of the default tags: service, team, namespace, cluster, app_env, app_group)
    tags:
      - 'tagname:tagvalue'
    # -- (`map[string]interface{}`) Optional: monitor options
    options:
Comment on lines +48 to +49

Member: I would ignore all the sub-keys here and just point to the upstream documentation instead, as we are not likely to keep this list up to date.

Contributor (author): It's an option I thought about for the whole thing; in theory the names and the structure are exactly the same as the CRD, so I could point to the CRD (though some of the info was not properly documented and I had to extract it from the code).

      # -- (`map[string]string`) Optional: monitor thresholds
      thresholds:
        # -- (`string`) Optional: monitor critical threshold
        critical: "1"
        # -- (`string`) Optional: monitor warning threshold
        warning: "0.28"
      # -- (`int`) Optional: Time in seconds to wait before evaluating the monitor
      evaluationDelay: 300
      # -- (`boolean`) Optional: A Boolean indicating whether to group by simple monitor; triggers a single alert or multiple alerts when any group breaches the threshold.
      groupbySimpleMonitor: false
      # -- (`boolean`) Optional: A Boolean indicating whether notifications from this monitor automatically insert its triggering tags into the title.
      includeTags: False

Check warning on line 61 in charts/datadog-monitors/values.yaml (GitHub Actions / lint-and-test): 61:20 [truthy] truthy value should be one of [false, true]
      # -- (`int`) Optional: Time in seconds to allow a host to boot and applications to fully start before starting the evaluation.
      newGroupDelay: 300
      # -- (`boolean`) Optional: A Boolean indicating whether this monitor notifies when data stops reporting.
      notifyNoData: False

Check warning on line 65 in charts/datadog-monitors/values.yaml (GitHub Actions / lint-and-test): 65:21 [truthy] truthy value should be one of [false, true]
      # -- (`int`) Optional: The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks.
      noDataTimeframe: 30
      # -- (`int`) Optional: The number of minutes after the last notification before a monitor re-notifies on the current status.
      renotifyInterval: 0
      # -- (`int`) Optional: The number of times re-notification messages should be sent on the current status at the provided re-notification interval.
      renotifyOccurrences: 0
      # -- (`string[]`) Optional: The types of statuses for which re-notification messages should be sent (valid values are alert, warn, no data).
      renotifyStatus: []
      # -- (`string[]`) Optional: List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings.
      notifyBy: []
      # -- (`boolean`) Optional: A Boolean indicating whether this monitor requires a full window of data before it fires. We highly recommend setting this to false for sparse metrics; otherwise some evaluations are skipped.
      requireFullWindow: false
      # -- (`map[string]string`) Optional: Threshold windows to fine-tune alerting
      thresholdWindows:
        # -- (`string`) Optional: Describes how long an anomalous metric must be normal before the alert recovers.
        recoveryWindow: "10m"
        # -- (`string`) Optional: Describes how long an anomalous metric must be anomalous before the alert fires.
        alertWindow: "5m"