From 9075380f36d3cbafd8e19195649d2a6a06a70e20 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 15:01:19 -0500 Subject: [PATCH 01/11] feat: Adding new datadog monitors chart --- charts/datadog-monitors/Chart.yaml | 13 +++++ charts/datadog-monitors/ci/local-values.yaml | 1 + .../datadog-monitors/templates/_helpers.tpl | 7 +++ .../templates/datadog-monitors.yaml | 21 ++++++++ charts/datadog-monitors/values.local.yaml | 25 +++++++++ charts/datadog-monitors/values.yaml | 54 +++++++++++++++++++ 6 files changed, 121 insertions(+) create mode 100644 charts/datadog-monitors/Chart.yaml create mode 120000 charts/datadog-monitors/ci/local-values.yaml create mode 100644 charts/datadog-monitors/templates/_helpers.tpl create mode 100644 charts/datadog-monitors/templates/datadog-monitors.yaml create mode 100644 charts/datadog-monitors/values.local.yaml create mode 100644 charts/datadog-monitors/values.yaml diff --git a/charts/datadog-monitors/Chart.yaml b/charts/datadog-monitors/Chart.yaml new file mode 100644 index 00000000..6d382fe9 --- /dev/null +++ b/charts/datadog-monitors/Chart.yaml @@ -0,0 +1,13 @@ +apiVersion: v2 +name: datadog-monitors +description: datadog monitor alerts template +type: application +version: 0.0.1 +appVersion: latest +maintainers: + - name: scohen + email: scohen@nextdoor.com +dependencies: + - name: nd-common + version: 0.3.6 + repository: file://../nd-common \ No newline at end of file diff --git a/charts/datadog-monitors/ci/local-values.yaml b/charts/datadog-monitors/ci/local-values.yaml new file mode 120000 index 00000000..123e79ec --- /dev/null +++ b/charts/datadog-monitors/ci/local-values.yaml @@ -0,0 +1 @@ +../values.local.yaml \ No newline at end of file diff --git a/charts/datadog-monitors/templates/_helpers.tpl b/charts/datadog-monitors/templates/_helpers.tpl new file mode 100644 index 00000000..018d80d1 --- /dev/null +++ b/charts/datadog-monitors/templates/_helpers.tpl @@ -0,0 +1,7 @@ +{{- define "datadog-monitors.shared-tags" -}} +- "service:{{ default $.Release.Name .Values.serviceName }}" +- "namespace:{{ .Release.Namespace }}" +{{- with .Values.team }} +- "team:{{ . }}" +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml new file mode 100644 index 00000000..b8a91ecf --- /dev/null +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -0,0 +1,21 @@ +{{- range $name, $monitor := .Values.monitors }} +--- +apiVersion: datadoghq.com/v1alpha1 +kind: DatadogMonitor +metadata: + name: {{ $name }} + labels: {{- include "nd-common.labels" $ | nindent 4 }} +spec: + name: {{ $monitor.name | quote }} + message: {{ $monitor.message | quote }} + query: {{ $monitor.query | quote }} + type: {{ default "query alert" $monitor.type | quote }} + {{- with $monitor.priority }} + priority: {{.}} + {{- end }} + tags: {{ include "datadog-monitors.shared-tags" $ | nindent 4 }} + {{- with $monitor.tags }}{{ toYaml . | nindent 4 }}{{- end }} + options: + locked: false + {{- with $monitor.options }}{{ toYaml . 
| nindent 4 }}{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml new file mode 100644 index 00000000..fd5be215 --- /dev/null +++ b/charts/datadog-monitors/values.local.yaml @@ -0,0 +1,25 @@ +serviceName: eks +team: cloudeng +monitors: + failed-pods: + name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" + message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." + priority: "2" + query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" + type: "query alert" + datadog-log-alert-test: + query: "logs(\"source:nagios AND status:error\").index(\"default\").rollup(\"count\").last(\"1h\") > 5" + type: "log alert" + name: "Test log alert made from DatadogMonitor" + message: "1-2-3 testing" + tags: + - "test:datadog" + priority: 5 + options: + enableLogsSample: true + evaluationDelay: 300 + includeTags: true + locked: false + notifyNoData: true + noDataTimeframe: 30 + renotifyInterval: 1440 \ No newline at end of file diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml new file mode 100644 index 00000000..dfff32e5 --- /dev/null +++ b/charts/datadog-monitors/values.yaml @@ -0,0 +1,54 @@ +# Default monitor values +serviceName: # Default pagerduty service name for the alerts, will turn to a tag for alerts - if not provided, the .Release.name will be used by default +team: # Default pagerduty team name for the alerts, will turn to a tag for alerts - if not provided, the tag will not be added + +# Placeholder for default datadog monitors +monitors: {} +# failed-pods: # Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) +# name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name +# message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." # Required monitor message +# priority: "2" # Optional alert severity/priority +# query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # Required alert query +# type: "query alert" # Optional, defaults to 'query alert'. 
+# // The type of monitor chosen from: +# // - anomaly: `query alert` +# // - APM: `query alert` or `trace-analytics alert` +# // - composite: `composite` +# // - custom: `service check` +# // - forecast: `query alert` +# // - host: `service check` +# // - integration: `query alert` or `service check` +# // - live process: `process alert` +# // - logs: `log alert` +# // - metric: `query alert` +# // - network: `service check` +# // - outlier: `query alert` +# // - process: `service check` +# // - rum: `rum alert` +# // - SLO: `slo alert` +# // - watchdog: `event-v2 alert` +# // - event-v2: `event-v2 alert` +# // - audit: `audit alert` +# // - error-tracking: `error-tracking alert` +# // - database-monitoring: `database-monitoring alert` +# // - network-performance: `network-performance alert` +# tags: # Optional list of tags (will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) +# - 'tagname:tagvalue' +# options: # Optional monitor parameters +# thresholds: # Optional alert thresholds +# critical: "1" # Optional critical threshold +# warning: "0.28" # Optional warning threshold, critical threshold will be required if warning is specified +# evaluationDelay: 300 # Time in seconds to wait before evaluating the monitor +# groupbySimpleMonitor: false # Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold +# includeTags: false # A Boolean indicating whether notifications from this monitor automatically inserts its triggering tags into the title. +# newGroupDelay: 300 # Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation +# notifyNoData: false # A Boolean indicating whether this monitor notifies when data stops reporting. +# noDataTimeframe: 30 # The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. +# renotifyInterval: 0 # The number of minutes after the last notification before a monitor re-notifies on the current status. +# renotifyOccurrences: 0 # The number of times re-notification messages should be sent on the current status at the provided re-notification interval. +# renotifyStatus: [] # The types of statuses for which re-notification messages should be sent. Valid values are alert, warn, no data. +# notifyBy: [] # List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. +# requireFullWindow: false # A Boolean indicating whether this monitor requires full window of data before it will fire. We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. +# thresholdWindows: # Threshold windows to finetune alerting +# recoveryWindow: "10m" # Describes how long an anomalous metric must be normal before the alert recovers. +# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. 
\ No newline at end of file From 5bdef9969cc12a27f7f5500e5636849057b8fdcc Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 15:03:45 -0500 Subject: [PATCH 02/11] Add new lines at the end of each file --- charts/datadog-monitors/Chart.yaml | 2 +- charts/datadog-monitors/templates/_helpers.tpl | 2 +- charts/datadog-monitors/templates/datadog-monitors.yaml | 2 +- charts/datadog-monitors/values.local.yaml | 2 +- charts/datadog-monitors/values.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charts/datadog-monitors/Chart.yaml b/charts/datadog-monitors/Chart.yaml index 6d382fe9..73597b77 100644 --- a/charts/datadog-monitors/Chart.yaml +++ b/charts/datadog-monitors/Chart.yaml @@ -10,4 +10,4 @@ maintainers: dependencies: - name: nd-common version: 0.3.6 - repository: file://../nd-common \ No newline at end of file + repository: file://../nd-common diff --git a/charts/datadog-monitors/templates/_helpers.tpl b/charts/datadog-monitors/templates/_helpers.tpl index 018d80d1..80b076b0 100644 --- a/charts/datadog-monitors/templates/_helpers.tpl +++ b/charts/datadog-monitors/templates/_helpers.tpl @@ -4,4 +4,4 @@ {{- with .Values.team }} - "team:{{ . }}" {{- end }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml index b8a91ecf..bd802d53 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -18,4 +18,4 @@ spec: options: locked: false {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index fd5be215..aefa08c2 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -22,4 +22,4 @@ monitors: locked: false notifyNoData: true noDataTimeframe: 30 - renotifyInterval: 1440 \ No newline at end of file + renotifyInterval: 1440 diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index dfff32e5..46d60536 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -51,4 +51,4 @@ monitors: {} # requireFullWindow: false # A Boolean indicating whether this monitor requires full window of data before it will fire. We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. # thresholdWindows: # Threshold windows to finetune alerting # recoveryWindow: "10m" # Describes how long an anomalous metric must be normal before the alert recovers. -# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. \ No newline at end of file +# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. 
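The values.local.yaml introduced above doubles as the chart's test override: ci/local-values.yaml is a symlink to it, matching the ci/*-values.yaml layout that chart-testing (ct) looks for. The chart can also be rendered by hand before anything is installed. A minimal sketch, assuming the commands run from the repository root and that the nd-common dependency really is available at ../nd-common as Chart.yaml declares:

    helm dependency update charts/datadog-monitors
    helm template datadog-monitors charts/datadog-monitors \
        -f charts/datadog-monitors/values.local.yaml

The dependency update step matters because nd-common comes from a local file:// repository rather than a published chart, so helm template fails with a missing-dependency error until it has been vendored into the chart's charts/ directory.
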
From 5d04770f27a8e169f2b2ae02c99f8e4f3f2edca3 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 15:20:54 -0500 Subject: [PATCH 03/11] Making sure to install the datadog monitor CRD only if it is available --- charts/datadog-monitors/templates/datadog-monitors.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml index bd802d53..f150e826 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -1,3 +1,4 @@ +{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}} {{- range $name, $monitor := .Values.monitors }} --- apiVersion: datadoghq.com/v1alpha1 @@ -19,3 +20,4 @@ spec: locked: false {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} {{- end }} +{{- end }} From a7e36a49efb34b43df38a002e59fc5f496c8b915 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 16:52:05 -0500 Subject: [PATCH 04/11] Updating docs --- charts/datadog-monitors/README.md | 54 +++++++ .../templates/datadog-monitors.yaml | 2 + charts/datadog-monitors/values.local.yaml | 1 + charts/datadog-monitors/values.yaml | 135 +++++++++++------- 4 files changed, 140 insertions(+), 52 deletions(-) create mode 100644 charts/datadog-monitors/README.md diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md new file mode 100644 index 00000000..baf12ca5 --- /dev/null +++ b/charts/datadog-monitors/README.md @@ -0,0 +1,54 @@ +# datadog-monitors + +![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: latest](https://img.shields.io/badge/AppVersion-latest-informational?style=flat-square) + +datadog monitor alerts template + +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| scohen | | | + +## Requirements + +| Repository | Name | Version | +|------------|------|---------| +| file://../nd-common | nd-common | 0.3.6 | + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}}` | List of monitors | +| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. 
Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) @section -- monitor | +| monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False | +| monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."` | Required: monitor message | +| monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name | +| monitors.resourceName.options | `map[string]interface{}` | `{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}}` | Optional: monitor options | +| monitors.resourceName.options.evaluationDelay | `string` | `300` | Optional: Time in seconds to wait before evaluating the monitor | +| monitors.resourceName.options.groupbySimpleMonitor | `boolean` | `false` | Optional: A Boolean indicating Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold. | +| monitors.resourceName.options.includeTags | `boolean` | `false` | Optional: A Boolean indicating whether notifications from this monitor automatically insert its triggering tags into the title. | +| monitors.resourceName.options.newGroupDelay | `string` | `300` | Optional: Time in seconds to allow a host to boot and applications to fully start before starting the evaluation. | +| monitors.resourceName.options.noDataTimeframe | `int` | `30` | Optional: The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. | +| monitors.resourceName.options.notifyBy | `string[]` | `[]` | Optional: List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. | +| monitors.resourceName.options.notifyNoData | `boolean` | `false` | Optional: A Boolean indicating whether this monitor notifies when data stops reporting. | +| monitors.resourceName.options.renotifyInterval | `int` | `0` | Optional: The number of minutes after the last notification before a monitor re-notifies on the current status. 
| +| monitors.resourceName.options.renotifyOccurrences | `string[]` | `0` | Optional: The number of times re-notification messages should be sent on the current status at the provided re-notification interval. | +| monitors.resourceName.options.renotifyStatus | `string[]` | `[]` | Optional: The types of statuses for which re-notification messages should be sent(Valid values are alert, warn, no data). | +| monitors.resourceName.options.requireFullWindow | `boolean` | `false` | Optional: A Boolean indicating whether this monitor requires full window of data before it will fire, We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. | +| monitors.resourceName.options.thresholdWindows | `map[string]string` | `{"alertWindow":"5m","recoveryWindow":"10m"}` | Optional: Threshold windows to finetune alerting | +| monitors.resourceName.options.thresholdWindows.alertWindow | `string` | `"5m"` | Optional: Describes how long an anomalous metric must be anomalous before the alert fires. | +| monitors.resourceName.options.thresholdWindows.recoveryWindow | `string` | `"10m"` | Optional: Describes how long an anomalous metric must be normal before the alert recovers. | +| monitors.resourceName.options.thresholds | `map[string]string` | `{"critical":"1","warning":"0.28"}` | Optional: monitor thresholds | +| monitors.resourceName.options.thresholds.critical | `string` | `"1"` | Optional: monitor critical threshold | +| monitors.resourceName.options.thresholds.warning | `string` | `"0.28"` | Optional: monitor warning threshold | +| monitors.resourceName.priority | `string` | `"2"` | Optional: monitor piority | +| monitors.resourceName.query | `string` | `"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"` | Required: monitor query | +| monitors.resourceName.tags | `string[]` | `["tagname:tagvalue"]` | Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) | +| monitors.resourceName.type | `string` | `"query alert"` | Optional: monitor type, if not specified will default to 'query alert' Datadog monitor types to type values mapping: - anomaly: `query alert` - APM: `query alert` or `trace-analytics alert` - composite: `composite` - custom: `service check` - forecast: `query alert` - host: `service check` - integration: `query alert` or `service check` - live process: `process alert` - logs: `log alert` - metric: `query alert` - network: `service check` - outlier: `query alert` - process: `service check` - rum: `rum alert` - SLO: `slo alert` - watchdog: `event-v2 alert` - event-v2: `event-v2 alert` - audit: `audit alert` - error-tracking: `error-tracking alert` - database-monitoring: `database-monitoring alert` - network-performance: `network-performance alert` - service-discovery: `service-discovery alert` | +| serviceName | `string` | `nil` | Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default | +| team | `string` | `nil` | Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml 
b/charts/datadog-monitors/templates/datadog-monitors.yaml index f150e826..43bc61f7 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -1,5 +1,6 @@ {{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}} {{- range $name, $monitor := .Values.monitors }} +{{- if not $monitor.disabled }} --- apiVersion: datadoghq.com/v1alpha1 kind: DatadogMonitor @@ -21,3 +22,4 @@ spec: {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} {{- end }} {{- end }} +{{- end }} diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index aefa08c2..7034d661 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -2,6 +2,7 @@ serviceName: eks team: cloudeng monitors: failed-pods: + disabled: True name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." priority: "2" diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index 46d60536..f69db66e 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -1,54 +1,85 @@ # Default monitor values -serviceName: # Default pagerduty service name for the alerts, will turn to a tag for alerts - if not provided, the .Release.name will be used by default -team: # Default pagerduty team name for the alerts, will turn to a tag for alerts - if not provided, the tag will not be added +# -- (`string`) Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default +serviceName: +# -- (`string`) Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added +team: -# Placeholder for default datadog monitors -monitors: {} -# failed-pods: # Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) -# name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name -# message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." # Required monitor message -# priority: "2" # Optional alert severity/priority -# query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # Required alert query -# type: "query alert" # Optional, defaults to 'query alert'. 
-# // The type of monitor chosen from: -# // - anomaly: `query alert` -# // - APM: `query alert` or `trace-analytics alert` -# // - composite: `composite` -# // - custom: `service check` -# // - forecast: `query alert` -# // - host: `service check` -# // - integration: `query alert` or `service check` -# // - live process: `process alert` -# // - logs: `log alert` -# // - metric: `query alert` -# // - network: `service check` -# // - outlier: `query alert` -# // - process: `service check` -# // - rum: `rum alert` -# // - SLO: `slo alert` -# // - watchdog: `event-v2 alert` -# // - event-v2: `event-v2 alert` -# // - audit: `audit alert` -# // - error-tracking: `error-tracking alert` -# // - database-monitoring: `database-monitoring alert` -# // - network-performance: `network-performance alert` -# tags: # Optional list of tags (will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) -# - 'tagname:tagvalue' -# options: # Optional monitor parameters -# thresholds: # Optional alert thresholds -# critical: "1" # Optional critical threshold -# warning: "0.28" # Optional warning threshold, critical threshold will be required if warning is specified -# evaluationDelay: 300 # Time in seconds to wait before evaluating the monitor -# groupbySimpleMonitor: false # Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold -# includeTags: false # A Boolean indicating whether notifications from this monitor automatically inserts its triggering tags into the title. -# newGroupDelay: 300 # Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation -# notifyNoData: false # A Boolean indicating whether this monitor notifies when data stops reporting. -# noDataTimeframe: 30 # The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. -# renotifyInterval: 0 # The number of minutes after the last notification before a monitor re-notifies on the current status. -# renotifyOccurrences: 0 # The number of times re-notification messages should be sent on the current status at the provided re-notification interval. -# renotifyStatus: [] # The types of statuses for which re-notification messages should be sent. Valid values are alert, warn, no data. -# notifyBy: [] # List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. -# requireFullWindow: false # A Boolean indicating whether this monitor requires full window of data before it will fire. We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. -# thresholdWindows: # Threshold windows to finetune alerting -# recoveryWindow: "10m" # Describes how long an anomalous metric must be normal before the alert recovers. -# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. 
+# -- (`map[string]interface{}`) List of monitors +monitors: + # -- (`map[string]interface{}`) Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) + # @section -- monitor + resourceName: + # -- (`boolean`) Optional: whether to exclude the monitor, defaults to False + disabled: True + # -- (`string`) Require: monitor name + name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name + # -- (`string`) Required: monitor message + message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." + # -- (`string`) Optional: monitor piority + priority: "2" # Optional alert severity/priority + # -- (`string`) Required: monitor query + query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" + # -- (`string`) Optional: monitor type, if not specified will default to 'query alert' + # Datadog monitor types to type values mapping: + # - anomaly: `query alert` + # - APM: `query alert` or `trace-analytics alert` + # - composite: `composite` + # - custom: `service check` + # - forecast: `query alert` + # - host: `service check` + # - integration: `query alert` or `service check` + # - live process: `process alert` + # - logs: `log alert` + # - metric: `query alert` + # - network: `service check` + # - outlier: `query alert` + # - process: `service check` + # - rum: `rum alert` + # - SLO: `slo alert` + # - watchdog: `event-v2 alert` + # - event-v2: `event-v2 alert` + # - audit: `audit alert` + # - error-tracking: `error-tracking alert` + # - database-monitoring: `database-monitoring alert` + # - network-performance: `network-performance alert` + # - service-discovery: `service-discovery alert` + type: "query alert" + # -- (`string[]`) Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) + tags: + - 'tagname:tagvalue' + # -- (`map[string]interface{}`) Optional: monitor options + options: + # -- (`map[string]string`) Optional: monitor thresholds + thresholds: + # -- (`string`) Optional: monitor critical threshold + critical: "1" + # -- (`string`) Optional: monitor warning threshold + warning: "0.28" + # -- (`string`) Optional: Time in seconds to wait before evaluating the monitor + evaluationDelay: 300 + # -- (`boolean`) Optional: A Boolean indicating Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold. + groupbySimpleMonitor: false + # -- (`boolean`) Optional: A Boolean indicating whether notifications from this monitor automatically insert its triggering tags into the title. + includeTags: False + # -- (`string`) Optional: Time in seconds to allow a host to boot and applications to fully start before starting the evaluation. + newGroupDelay: 300 + # -- (`boolean`) Optional: A Boolean indicating whether this monitor notifies when data stops reporting. + notifyNoData: False + # -- (`int`) Optional: The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. 
+ noDataTimeframe: 30 + # -- (`int`) Optional: The number of minutes after the last notification before a monitor re-notifies on the current status. + renotifyInterval: 0 + # -- (`string[]`) Optional: The number of times re-notification messages should be sent on the current status at the provided re-notification interval. + renotifyOccurrences: 0 + # -- (`string[]`) Optional: The types of statuses for which re-notification messages should be sent(Valid values are alert, warn, no data). + renotifyStatus: [] + # -- (`string[]`) Optional: List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. + notifyBy: [] + # -- (`boolean`) Optional: A Boolean indicating whether this monitor requires full window of data before it will fire, We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. + requireFullWindow: false + # -- (`map[string]string`) Optional: Threshold windows to finetune alerting + thresholdWindows: + # -- (`string`) Optional: Describes how long an anomalous metric must be normal before the alert recovers. + recoveryWindow: "10m" + # -- (`string`) Optional: Describes how long an anomalous metric must be anomalous before the alert fires. + alertWindow: "5m" From edc1ad7f3919e7c191e815740d3f0e1394791cb9 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 16:55:30 -0500 Subject: [PATCH 05/11] Updating docs --- charts/datadog-monitors/values.local.yaml | 2 +- charts/datadog-monitors/values.yaml | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index 7034d661..290b5b9c 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -2,7 +2,7 @@ serviceName: eks team: cloudeng monitors: failed-pods: - disabled: True + disabled: true name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." priority: "2" diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index f69db66e..8753e367 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -1,4 +1,3 @@ -# Default monitor values # -- (`string`) Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default serviceName: # -- (`string`) Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added @@ -7,16 +6,15 @@ team: # -- (`map[string]interface{}`) List of monitors monitors: # -- (`map[string]interface{}`) Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) - # @section -- monitor resourceName: # -- (`boolean`) Optional: whether to exclude the monitor, defaults to False - disabled: True + disabled: true # -- (`string`) Require: monitor name - name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name + name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # -- (`string`) Required: monitor message message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). 
\n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." # -- (`string`) Optional: monitor piority - priority: "2" # Optional alert severity/priority + priority: "2"\ # -- (`string`) Required: monitor query query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # -- (`string`) Optional: monitor type, if not specified will default to 'query alert' From 1b7c791bbeb72fd43c9c613c83625283c55c7753 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:09:03 -0500 Subject: [PATCH 06/11] Test with crd --- .../crds/datadoghq.com_datadogmonitors.yaml | 378 ++++++++++++++++++ charts/datadog-monitors/values.local.yaml | 1 + 2 files changed, 379 insertions(+) create mode 100644 charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml diff --git a/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml b/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml new file mode 100644 index 00000000..60b95865 --- /dev/null +++ b/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml @@ -0,0 +1,378 @@ +{{- if .Values.testInstallCrd }} +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + creationTimestamp: null + name: datadogmonitors.datadoghq.com +spec: + group: datadoghq.com + names: + kind: DatadogMonitor + listKind: DatadogMonitorList + plural: datadogmonitors + singular: datadogmonitor + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.id + name: id + type: string + - jsonPath: .status.monitorState + name: monitor state + type: string + - jsonPath: .status.monitorStateLastTransitionTime + name: last state transition + type: string + - format: date + jsonPath: .status.monitorStateLastUpdateTime + name: last state sync + type: string + - jsonPath: .status.syncStatus + name: sync status + type: string + - jsonPath: .metadata.creationTimestamp + name: age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: DatadogMonitor allows to define and manage Monitors from your + Kubernetes Cluster + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DatadogMonitorSpec defines the desired state of DatadogMonitor + properties: + controllerOptions: + description: ControllerOptions are the optional parameters in the + DatadogMonitor controller + properties: + disableRequiredTags: + description: DisableRequiredTags disables the automatic addition + of required tags to monitors. 
+ type: boolean + type: object + message: + description: Message is a message to include with notifications for + this monitor + type: string + name: + description: Name is the monitor name + type: string + options: + description: Options are the optional parameters associated with your + monitor + properties: + enableLogsSample: + description: A Boolean indicating whether to send a log sample + when the log monitor triggers. + type: boolean + escalationMessage: + description: A message to include with a re-notification. + type: string + evaluationDelay: + description: |- + Time (in seconds) to delay evaluation, as a non-negative integer. For example, if the value is set to 300 (5min), + the timeframe is set to last_5m and the time is 7:00, the monitor evaluates data from 6:50 to 6:55. + This is useful for AWS CloudWatch and other backfilled metrics to ensure the monitor always has data during evaluation. + format: int64 + type: integer + groupbySimpleMonitor: + description: A Boolean indicating whether the log alert monitor + triggers a single alert or multiple alerts when any group breaches + a threshold. + type: boolean + includeTags: + description: A Boolean indicating whether notifications from this + monitor automatically inserts its triggering tags into the title. + type: boolean + locked: + description: 'DEPRECATED: Whether or not the monitor is locked + (only editable by creator and admins). Use `restricted_roles` + instead.' + type: boolean + newGroupDelay: + description: |- + Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation of + monitor results. Should be a non negative integer. + format: int64 + type: integer + noDataTimeframe: + description: |- + The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the + monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe + is used for metric alerts, and 24 hours is used for service checks. + format: int64 + type: integer + notificationPresetName: + description: An enum that toggles the display of additional content + sent in the monitor notification. + type: string + notifyAudit: + description: A Boolean indicating whether tagged users are notified + on changes to this monitor. + type: boolean + notifyBy: + description: |- + A string indicating the granularity a monitor alerts on. Only available for monitors with groupings. + For instance, a monitor grouped by cluster, namespace, and pod can be configured to only notify on each new + cluster violating the alert conditions by setting notify_by to ["cluster"]. Tags mentioned in notify_by must + be a subset of the grouping tags in the query. For example, a query grouped by cluster and namespace cannot + notify on region. Setting notify_by to [*] configures the monitor to notify as a simple-alert. + items: + type: string + type: array + x-kubernetes-list-type: set + notifyNoData: + description: A Boolean indicating whether this monitor notifies + when data stops reporting. + type: boolean + onMissingData: + description: |- + An enum that controls how groups or monitors are treated if an evaluation does not return data points. + The default option results in different behavior depending on the monitor query type. + For monitors using Count queries, an empty monitor evaluation is treated as 0 and is compared to the threshold conditions. 
+ For monitors using any query type other than Count, for example Gauge, Measure, or Rate, the monitor shows the last known status. + This option is only available for APM Trace Analytics, Audit Trail, CI, Error Tracking, Event, Logs, and RUM monitors + type: string + renotifyInterval: + description: |- + The number of minutes after the last notification before a monitor re-notifies on the current status. + It only re-notifies if it’s not resolved. + format: int64 + type: integer + renotifyOccurrences: + description: The number of times re-notification messages should + be sent on the current status at the provided re-notification + interval. + format: int64 + type: integer + renotifyStatuses: + description: The types of statuses for which re-notification messages + should be sent. Valid values are alert, warn, no data. + items: + description: MonitorRenotifyStatusType The different statuses + for which renotification is supported. + type: string + type: array + x-kubernetes-list-type: set + requireFullWindow: + description: |- + A Boolean indicating whether this monitor needs a full window of data before it’s evaluated. We highly + recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. Default is false. + type: boolean + thresholdWindows: + description: A struct of the alerting time window options. + properties: + recoveryWindow: + description: Describes how long an anomalous metric must be + normal before the alert recovers. + type: string + triggerWindow: + description: Describes how long a metric must be anomalous + before an alert triggers. + type: string + type: object + thresholds: + description: A struct of the different monitor threshold values. + properties: + critical: + description: The monitor CRITICAL threshold. + type: string + criticalRecovery: + description: The monitor CRITICAL recovery threshold. + type: string + ok: + description: The monitor OK threshold. + type: string + unknown: + description: The monitor UNKNOWN threshold. + type: string + warning: + description: The monitor WARNING threshold. + type: string + warningRecovery: + description: The monitor WARNING recovery threshold. + type: string + type: object + timeoutH: + description: The number of hours of the monitor not reporting + data before it automatically resolves from a triggered state. + format: int64 + type: integer + type: object + priority: + description: Priority is an integer from 1 (high) to 5 (low) indicating + alert severity + format: int64 + type: integer + query: + description: Query is the Datadog monitor query + type: string + restrictedRoles: + description: |- + RestrictedRoles is a list of unique role identifiers to define which roles are allowed to edit the monitor. + `restricted_roles` is the successor of `locked`. For more information about `locked` and `restricted_roles`, + see the [monitor options docs](https://docs.datadoghq.com/monitors/guide/monitor_api_options/#permissions-options). + items: + type: string + type: array + x-kubernetes-list-type: set + tags: + description: Tags is the monitor tags associated with your monitor + items: + type: string + type: array + x-kubernetes-list-type: set + type: + description: Type is the monitor type + type: string + type: object + status: + description: DatadogMonitorStatus defines the observed state of DatadogMonitor + properties: + conditions: + description: Conditions Represents the latest available observations + of a DatadogMonitor's current state. 
+ items: + description: DatadogMonitorCondition describes the current state + of a DatadogMonitor + properties: + lastTransitionTime: + description: Last time the condition transitioned from one status + to another. + format: date-time + type: string + lastUpdateTime: + description: Last time the condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details about + the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of DatadogMonitor condition + type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + created: + description: Created is the time the monitor was created + format: date-time + type: string + creator: + description: Creator is the identify of the monitor creator + type: string + currentHash: + description: |- + CurrentHash tracks the hash of the current DatadogMonitorSpec to know + if the Spec has changed and needs an update + type: string + downtimeStatus: + description: DowntimeStatus defines whether the monitor is downtimed + properties: + downtimeID: + description: DowntimeID is the downtime ID. + type: integer + isDowntimed: + description: IsDowntimed shows the downtime status of the monitor. + type: boolean + type: object + id: + description: ID is the monitor ID generated in Datadog + type: integer + monitorLastForceSyncTime: + description: MonitorLastForceSyncTime is the last time the API monitor + was last force synced with the DatadogMonitor resource + format: date-time + type: string + monitorState: + description: MonitorState is the overall state of monitor + type: string + monitorStateLastTransitionTime: + description: MonitorStateLastTransitionTime is the last time the monitor + state changed + format: date-time + type: string + monitorStateLastUpdateTime: + description: MonitorStateLastUpdateTime is the last time the monitor + state updated + format: date-time + type: string + monitorStateSyncStatus: + description: MonitorStateSyncStatus shows the health of syncing the + monitor state to Datadog + type: string + primary: + description: |- + Primary defines whether the monitor is managed by the Kubernetes custom + resource (true) or outside Kubernetes (false) + type: boolean + triggeredState: + description: TriggeredState only includes details for monitor groups + that are triggering + items: + description: |- + DatadogMonitorTriggeredState represents the details of a triggering DatadogMonitor + The DatadogMonitor is triggering if one of its groups is in Alert, Warn, or No Data + properties: + lastTransitionTime: + format: date-time + type: string + monitorGroup: + description: MonitorGroup is the name of the triggering group + type: string + state: + description: DatadogMonitorState represents the overall DatadogMonitor + state + type: string + required: + - monitorGroup + type: object + type: array + x-kubernetes-list-map-keys: + - monitorGroup + x-kubernetes-list-type: map + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null +{{- end }} \ No newline at end of file diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index 290b5b9c..cd5bd506 100644 --- 
a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -1,3 +1,4 @@ +testInstallCrd: true serviceName: eks team: cloudeng monitors: From 7c55e1c675224126c107dc9eaf7fcb061a68b8e2 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:11:04 -0500 Subject: [PATCH 07/11] Test with crd --- charts/datadog-monitors/templates/datadog-monitors.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml index 43bc61f7..adb117a5 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -1,4 +1,4 @@ -{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}} +{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}} {{- range $name, $monitor := .Values.monitors }} {{- if not $monitor.disabled }} --- @@ -22,4 +22,4 @@ spec: {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} {{- end }} {{- end }} -{{- end }} +{{/*- end */}} From d994925e7e5065588fb9f55284243c47091ba5a2 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:24:45 -0500 Subject: [PATCH 08/11] Update --- .../crds/datadoghq.com_datadogmonitors.yaml | 378 ------------------ .../templates/datadog-monitors.yaml | 4 +- charts/datadog-monitors/values.yaml | 2 +- 3 files changed, 3 insertions(+), 381 deletions(-) delete mode 100644 charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml diff --git a/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml b/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml deleted file mode 100644 index 60b95865..00000000 --- a/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml +++ /dev/null @@ -1,378 +0,0 @@ -{{- if .Values.testInstallCrd }} -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.14.0 - creationTimestamp: null - name: datadogmonitors.datadoghq.com -spec: - group: datadoghq.com - names: - kind: DatadogMonitor - listKind: DatadogMonitorList - plural: datadogmonitors - singular: datadogmonitor - scope: Namespaced - versions: - - additionalPrinterColumns: - - jsonPath: .status.id - name: id - type: string - - jsonPath: .status.monitorState - name: monitor state - type: string - - jsonPath: .status.monitorStateLastTransitionTime - name: last state transition - type: string - - format: date - jsonPath: .status.monitorStateLastUpdateTime - name: last state sync - type: string - - jsonPath: .status.syncStatus - name: sync status - type: string - - jsonPath: .metadata.creationTimestamp - name: age - type: date - name: v1alpha1 - schema: - openAPIV3Schema: - description: DatadogMonitor allows to define and manage Monitors from your - Kubernetes Cluster - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: DatadogMonitorSpec defines the desired state of DatadogMonitor - properties: - controllerOptions: - description: ControllerOptions are the optional parameters in the - DatadogMonitor controller - properties: - disableRequiredTags: - description: DisableRequiredTags disables the automatic addition - of required tags to monitors. - type: boolean - type: object - message: - description: Message is a message to include with notifications for - this monitor - type: string - name: - description: Name is the monitor name - type: string - options: - description: Options are the optional parameters associated with your - monitor - properties: - enableLogsSample: - description: A Boolean indicating whether to send a log sample - when the log monitor triggers. - type: boolean - escalationMessage: - description: A message to include with a re-notification. - type: string - evaluationDelay: - description: |- - Time (in seconds) to delay evaluation, as a non-negative integer. For example, if the value is set to 300 (5min), - the timeframe is set to last_5m and the time is 7:00, the monitor evaluates data from 6:50 to 6:55. - This is useful for AWS CloudWatch and other backfilled metrics to ensure the monitor always has data during evaluation. - format: int64 - type: integer - groupbySimpleMonitor: - description: A Boolean indicating whether the log alert monitor - triggers a single alert or multiple alerts when any group breaches - a threshold. - type: boolean - includeTags: - description: A Boolean indicating whether notifications from this - monitor automatically inserts its triggering tags into the title. - type: boolean - locked: - description: 'DEPRECATED: Whether or not the monitor is locked - (only editable by creator and admins). Use `restricted_roles` - instead.' - type: boolean - newGroupDelay: - description: |- - Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation of - monitor results. Should be a non negative integer. - format: int64 - type: integer - noDataTimeframe: - description: |- - The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the - monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe - is used for metric alerts, and 24 hours is used for service checks. - format: int64 - type: integer - notificationPresetName: - description: An enum that toggles the display of additional content - sent in the monitor notification. - type: string - notifyAudit: - description: A Boolean indicating whether tagged users are notified - on changes to this monitor. - type: boolean - notifyBy: - description: |- - A string indicating the granularity a monitor alerts on. Only available for monitors with groupings. - For instance, a monitor grouped by cluster, namespace, and pod can be configured to only notify on each new - cluster violating the alert conditions by setting notify_by to ["cluster"]. Tags mentioned in notify_by must - be a subset of the grouping tags in the query. For example, a query grouped by cluster and namespace cannot - notify on region. Setting notify_by to [*] configures the monitor to notify as a simple-alert. 
- items: - type: string - type: array - x-kubernetes-list-type: set - notifyNoData: - description: A Boolean indicating whether this monitor notifies - when data stops reporting. - type: boolean - onMissingData: - description: |- - An enum that controls how groups or monitors are treated if an evaluation does not return data points. - The default option results in different behavior depending on the monitor query type. - For monitors using Count queries, an empty monitor evaluation is treated as 0 and is compared to the threshold conditions. - For monitors using any query type other than Count, for example Gauge, Measure, or Rate, the monitor shows the last known status. - This option is only available for APM Trace Analytics, Audit Trail, CI, Error Tracking, Event, Logs, and RUM monitors - type: string - renotifyInterval: - description: |- - The number of minutes after the last notification before a monitor re-notifies on the current status. - It only re-notifies if it’s not resolved. - format: int64 - type: integer - renotifyOccurrences: - description: The number of times re-notification messages should - be sent on the current status at the provided re-notification - interval. - format: int64 - type: integer - renotifyStatuses: - description: The types of statuses for which re-notification messages - should be sent. Valid values are alert, warn, no data. - items: - description: MonitorRenotifyStatusType The different statuses - for which renotification is supported. - type: string - type: array - x-kubernetes-list-type: set - requireFullWindow: - description: |- - A Boolean indicating whether this monitor needs a full window of data before it’s evaluated. We highly - recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. Default is false. - type: boolean - thresholdWindows: - description: A struct of the alerting time window options. - properties: - recoveryWindow: - description: Describes how long an anomalous metric must be - normal before the alert recovers. - type: string - triggerWindow: - description: Describes how long a metric must be anomalous - before an alert triggers. - type: string - type: object - thresholds: - description: A struct of the different monitor threshold values. - properties: - critical: - description: The monitor CRITICAL threshold. - type: string - criticalRecovery: - description: The monitor CRITICAL recovery threshold. - type: string - ok: - description: The monitor OK threshold. - type: string - unknown: - description: The monitor UNKNOWN threshold. - type: string - warning: - description: The monitor WARNING threshold. - type: string - warningRecovery: - description: The monitor WARNING recovery threshold. - type: string - type: object - timeoutH: - description: The number of hours of the monitor not reporting - data before it automatically resolves from a triggered state. - format: int64 - type: integer - type: object - priority: - description: Priority is an integer from 1 (high) to 5 (low) indicating - alert severity - format: int64 - type: integer - query: - description: Query is the Datadog monitor query - type: string - restrictedRoles: - description: |- - RestrictedRoles is a list of unique role identifiers to define which roles are allowed to edit the monitor. - `restricted_roles` is the successor of `locked`. For more information about `locked` and `restricted_roles`, - see the [monitor options docs](https://docs.datadoghq.com/monitors/guide/monitor_api_options/#permissions-options). 
- items: - type: string - type: array - x-kubernetes-list-type: set - tags: - description: Tags is the monitor tags associated with your monitor - items: - type: string - type: array - x-kubernetes-list-type: set - type: - description: Type is the monitor type - type: string - type: object - status: - description: DatadogMonitorStatus defines the observed state of DatadogMonitor - properties: - conditions: - description: Conditions Represents the latest available observations - of a DatadogMonitor's current state. - items: - description: DatadogMonitorCondition describes the current state - of a DatadogMonitor - properties: - lastTransitionTime: - description: Last time the condition transitioned from one status - to another. - format: date-time - type: string - lastUpdateTime: - description: Last time the condition was updated. - format: date-time - type: string - message: - description: A human readable message indicating details about - the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of DatadogMonitor condition - type: string - required: - - status - - type - type: object - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - created: - description: Created is the time the monitor was created - format: date-time - type: string - creator: - description: Creator is the identify of the monitor creator - type: string - currentHash: - description: |- - CurrentHash tracks the hash of the current DatadogMonitorSpec to know - if the Spec has changed and needs an update - type: string - downtimeStatus: - description: DowntimeStatus defines whether the monitor is downtimed - properties: - downtimeID: - description: DowntimeID is the downtime ID. - type: integer - isDowntimed: - description: IsDowntimed shows the downtime status of the monitor. 
-                    type: boolean
-                type: object
-              id:
-                description: ID is the monitor ID generated in Datadog
-                type: integer
-              monitorLastForceSyncTime:
-                description: MonitorLastForceSyncTime is the last time the API monitor
-                  was last force synced with the DatadogMonitor resource
-                format: date-time
-                type: string
-              monitorState:
-                description: MonitorState is the overall state of monitor
-                type: string
-              monitorStateLastTransitionTime:
-                description: MonitorStateLastTransitionTime is the last time the monitor
-                  state changed
-                format: date-time
-                type: string
-              monitorStateLastUpdateTime:
-                description: MonitorStateLastUpdateTime is the last time the monitor
-                  state updated
-                format: date-time
-                type: string
-              monitorStateSyncStatus:
-                description: MonitorStateSyncStatus shows the health of syncing the
-                  monitor state to Datadog
-                type: string
-              primary:
-                description: |-
-                  Primary defines whether the monitor is managed by the Kubernetes custom
-                  resource (true) or outside Kubernetes (false)
-                type: boolean
-              triggeredState:
-                description: TriggeredState only includes details for monitor groups
-                  that are triggering
-                items:
-                  description: |-
-                    DatadogMonitorTriggeredState represents the details of a triggering DatadogMonitor
-                    The DatadogMonitor is triggering if one of its groups is in Alert, Warn, or No Data
-                  properties:
-                    lastTransitionTime:
-                      format: date-time
-                      type: string
-                    monitorGroup:
-                      description: MonitorGroup is the name of the triggering group
-                      type: string
-                    state:
-                      description: DatadogMonitorState represents the overall DatadogMonitor
-                        state
-                      type: string
-                  required:
-                  - monitorGroup
-                  type: object
-                type: array
-                x-kubernetes-list-map-keys:
-                - monitorGroup
-                x-kubernetes-list-type: map
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
-status:
-  acceptedNames:
-    kind: ""
-    plural: ""
-  conditions: null
-  storedVersions: null
-{{- end }}
\ No newline at end of file
diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml
index adb117a5..43bc61f7 100644
--- a/charts/datadog-monitors/templates/datadog-monitors.yaml
+++ b/charts/datadog-monitors/templates/datadog-monitors.yaml
@@ -1,4 +1,4 @@
-{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}}
+{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}}
 {{- range $name, $monitor := .Values.monitors }}
 {{- if not $monitor.disabled }}
 ---
@@ -22,4 +22,4 @@ spec:
   {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }}
 {{- end }}
 {{- end }}
-{{/*- end */}}
+{{- end }}
diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml
index 8753e367..97e12980 100644
--- a/charts/datadog-monitors/values.yaml
+++ b/charts/datadog-monitors/values.yaml
@@ -14,7 +14,7 @@ monitors:
     # -- (`string`) Required: monitor message
     message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."
# -- (`string`) Optional: monitor piority - priority: "2"\ + priority: "2" # -- (`string`) Required: monitor query query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # -- (`string`) Optional: monitor type, if not specified will default to 'query alert' From 29228bb82e0d4b3ed5c63863b3a45e7d1958b7aa Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:26:24 -0500 Subject: [PATCH 09/11] Update docs --- charts/datadog-monitors/README.md | 2 +- charts/datadog-monitors/values.local.yaml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md index baf12ca5..d816e755 100644 --- a/charts/datadog-monitors/README.md +++ b/charts/datadog-monitors/README.md @@ -21,7 +21,7 @@ datadog monitor alerts template | Key | Type | Default | Description | |-----|------|---------|-------------| | monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}}` | List of monitors | -| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) @section -- monitor | +| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. 
Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) | | monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False | | monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."` | Required: monitor message | | monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name | diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index cd5bd506..290b5b9c 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -1,4 +1,3 @@ -testInstallCrd: true serviceName: eks team: cloudeng monitors: From 1ea289844a3703be08a6adc151f26f188d81bc03 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Wed, 6 Nov 2024 11:42:03 -0500 Subject: [PATCH 10/11] Update --- charts/datadog-monitors/README.md | 6 +++--- .../datadog-monitors/templates/_helpers.tpl | 19 ++++++++++++------- .../templates/datadog-monitors.yaml | 7 +++---- charts/datadog-monitors/values.local.yaml | 3 ++- charts/datadog-monitors/values.yaml | 4 ++-- 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md index d816e755..ee802aaa 100644 --- a/charts/datadog-monitors/README.md +++ b/charts/datadog-monitors/README.md @@ -20,8 +20,8 @@ datadog monitor alerts template | Key | Type | Default | Description | |-----|------|---------|-------------| -| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}}` | List of monitors | -| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). 
\n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) | +| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}}` | List of monitors | +| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) | | monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False | | monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. 
Change the threshold to suit your needs."` | Required: monitor message |
 | monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name |
@@ -45,7 +45,7 @@ datadog monitor alerts template
 | monitors.resourceName.options.thresholds.warning | `string` | `"0.28"` | Optional: monitor warning threshold |
 | monitors.resourceName.priority | `string` | `"2"` | Optional: monitor piority |
 | monitors.resourceName.query | `string` | `"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"` | Required: monitor query |
-| monitors.resourceName.tags | `string[]` | `["tagname:tagvalue"]` | Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) |
+| monitors.resourceName.tags | `map[string]string` | `{"tagname":"tagvalue"}` | Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace) |
 | monitors.resourceName.type | `string` | `"query alert"` | Optional: monitor type, if not specified will default to 'query alert' Datadog monitor types to type values mapping: - anomaly: `query alert` - APM: `query alert` or `trace-analytics alert` - composite: `composite` - custom: `service check` - forecast: `query alert` - host: `service check` - integration: `query alert` or `service check` - live process: `process alert` - logs: `log alert` - metric: `query alert` - network: `service check` - outlier: `query alert` - process: `service check` - rum: `rum alert` - SLO: `slo alert` - watchdog: `event-v2 alert` - event-v2: `event-v2 alert` - audit: `audit alert` - error-tracking: `error-tracking alert` - database-monitoring: `database-monitoring alert` - network-performance: `network-performance alert` - service-discovery: `service-discovery alert` |
 | serviceName | `string` | `nil` | Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default |
 | team | `string` | `nil` | Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added |
diff --git a/charts/datadog-monitors/templates/_helpers.tpl b/charts/datadog-monitors/templates/_helpers.tpl
index 80b076b0..cfe2f7fe 100644
--- a/charts/datadog-monitors/templates/_helpers.tpl
+++ b/charts/datadog-monitors/templates/_helpers.tpl
@@ -1,7 +1,12 @@
-{{- define "datadog-monitors.shared-tags" -}}
-- "service:{{ default $.Release.Name .Values.serviceName }}"
-- "namespace:{{ .Release.Namespace }}"
-{{- with .Values.team }}
-- "team:{{ . }}"
-{{- end }}
-{{- end }}
+{{- define "datadog-monitors.tags" -}}
+{{- $root := index . 0 -}}
+{{- $tags := index . 1 -}}
+{{- $sharedtags := dict "service" (default $root.Release.Name $root.Values.serviceName) "namespace" $root.Release.Namespace -}}
+{{- with $root.Values.team -}}
+{{- $_ := set $sharedtags "team" . -}}
+{{- end -}}
+{{- $finaltags := mergeOverwrite $sharedtags $tags -}}
+{{- range $k, $v := $finaltags -}}
+- "{{ $k }}:{{ $v }}"
+{{ end -}}
+{{- end -}}
\ No newline at end of file
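The reworked helper takes a two-element list -- the root context and the per-monitor tags map -- overlays the monitor's map on the shared service/namespace/team map, and emits one quoted `key:value` entry per key. A rough sketch of what it produces for the datadog-log-alert-test monitor in values.local.yaml (namespace `default` is assumed here, and map keys come out in sorted order):

    # Invocation in templates/datadog-monitors.yaml:
    #   tags: {{ include "datadog-monitors.tags" (list $ .tags) | nindent 4 }}
    # With serviceName: eks, team: cloudeng and monitor tags {test: datadog, team: data},
    # the per-monitor "team: data" overrides the chart-level team and the output is roughly:
    tags:
      - "namespace:default"
      - "service:eks"
      - "team:data"
      - "test:datadog"

One hedged observation on the edge case: for a monitor that defines no `tags` at all, the second list element is nil while sprig's `mergeOverwrite` expects a map, so passing `(default dict .tags)` instead of `.tags` would make the empty case safe; the CI values only exercise monitors that do carry tags.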
diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml
index 43bc61f7..2553074f 100644
--- a/charts/datadog-monitors/templates/datadog-monitors.yaml
+++ b/charts/datadog-monitors/templates/datadog-monitors.yaml
@@ -1,4 +1,4 @@
-{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}}
+{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}}
 {{- range $name, $monitor := .Values.monitors }}
 {{- if not $monitor.disabled }}
 ---
@@ -15,11 +15,10 @@ spec:
   {{- with $monitor.priority }}
   priority: {{.}}
   {{- end }}
-  tags: {{ include "datadog-monitors.shared-tags" $ | nindent 4 }}
-  {{- with $monitor.tags }}{{ toYaml . | nindent 4 }}{{- end }}
+  tags: {{ include "datadog-monitors.tags" (list $ .tags) | nindent 4 }}
   options:
     locked: false
   {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }}
 {{- end }}
 {{- end }}
-{{- end }}
+{{/*- end */}}
diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml
index 290b5b9c..000ac5b7 100644
--- a/charts/datadog-monitors/values.local.yaml
+++ b/charts/datadog-monitors/values.local.yaml
@@ -14,7 +14,8 @@ monitors:
     name: "Test log alert made from DatadogMonitor"
     message: "1-2-3 testing"
     tags:
-      - "test:datadog"
+      test: datadog
+      team: data
     priority: 5
     options:
       enableLogsSample: true
diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml
index 97e12980..0200292f 100644
--- a/charts/datadog-monitors/values.yaml
+++ b/charts/datadog-monitors/values.yaml
@@ -42,9 +42,9 @@ monitors:
     #   - network-performance: `network-performance alert`
     #   - service-discovery: `service-discovery alert`
     type: "query alert"
-    # -- (`string[]`) Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group)
+    # -- (`map[string]string`) Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace)
     tags:
-      - 'tagname:tagvalue'
+      tagname: tagvalue
     # -- (`map[string]interface{}`) Optional: monitor options
     options:
       # -- (`map[string]string`) Optional: monitor thresholds
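Switching the tags value from a list of `name:value` strings to a map has one practical consequence worth keeping in mind: tag keys must now be unique per monitor, and a per-monitor key silently replaces the shared tag of the same name (as the `team: data` override above shows). An illustrative values fragment for the new shape (the monitor name and tag values here are placeholders, not part of the chart):

    monitors:
      my-monitor:                 # illustrative monitor name
        tags:
          env: production         # rendered as "env:production"
          team: data              # overrides the chart-level team tag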
From 36333486d7327c82a6ba3099e2012514ce242374 Mon Sep 17 00:00:00 2001
From: Shalom Cohen
Date: Wed, 6 Nov 2024 11:52:20 -0500
Subject: [PATCH 11/11] Reverse the disabled flag to use enabled flag instead

---
 charts/datadog-monitors/README.md                       | 6 +++---
 charts/datadog-monitors/templates/datadog-monitors.yaml | 6 +++---
 charts/datadog-monitors/values.local.yaml               | 2 +-
 charts/datadog-monitors/values.yaml                     | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md
index ee802aaa..6df8abf0 100644
--- a/charts/datadog-monitors/README.md
+++ b/charts/datadog-monitors/README.md
@@ -20,9 +20,9 @@ datadog monitor alerts template
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}}` | List of monitors |
-| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) |
-| monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False |
+| monitors | `map[string]interface{}` | `{"resourceName":{"enabled":false,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}}` | List of monitors |
+| monitors.resourceName | `map[string]interface{}` | `{"enabled":false,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) |
+| monitors.resourceName.enabled | `boolean` | `false` | Optional: whether to enable the monitor, defaults to true |
 | monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."` | Required: monitor message |
 | monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name |
 | monitors.resourceName.options | `map[string]interface{}` | `{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}}` | Optional: monitor options |
diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml
index 2553074f..607f391e 100644
--- a/charts/datadog-monitors/templates/datadog-monitors.yaml
+++ b/charts/datadog-monitors/templates/datadog-monitors.yaml
@@ -1,6 +1,6 @@
-{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}}
+{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}}
 {{- range $name, $monitor := .Values.monitors }}
-{{- if not $monitor.disabled }}
+{{- if not (eq $monitor.enabled false) }}
 ---
 apiVersion: datadoghq.com/v1alpha1
 kind: DatadogMonitor
@@ -21,4 +21,4 @@ spec:
   {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }}
 {{- end }}
 {{- end }}
-{{/*- end */}}
+{{- end }}
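With the guard now written as `{{- if not (eq $monitor.enabled false) }}`, a monitor is rendered unless `enabled: false` is set explicitly; leaving the key out keeps the monitor on, which preserves the previous default-on behaviour. An illustrative values fragment (monitor names and the query are placeholders):

    monitors:
      failed-pods:
        enabled: false        # explicitly disabled -> no DatadogMonitor is rendered
      error-budget-burn:
        # no "enabled" key -> treated as enabled and rendered
        query: "..."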
priority: "2" diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index 0200292f..c0e4f37b 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -7,8 +7,8 @@ team: monitors: # -- (`map[string]interface{}`) Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) resourceName: - # -- (`boolean`) Optional: whether to exclude the monitor, defaults to False - disabled: true + # -- (`boolean`) Optional: whether to enable the monitor, defaults to true + enabled: false # -- (`string`) Require: monitor name name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # -- (`string`) Required: monitor message