From 9075380f36d3cbafd8e19195649d2a6a06a70e20 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 15:01:19 -0500 Subject: [PATCH 01/11] feat: Adding new datadog monitors chart --- charts/datadog-monitors/Chart.yaml | 13 +++++ charts/datadog-monitors/ci/local-values.yaml | 1 + .../datadog-monitors/templates/_helpers.tpl | 7 +++ .../templates/datadog-monitors.yaml | 21 ++++++++ charts/datadog-monitors/values.local.yaml | 25 +++++++++ charts/datadog-monitors/values.yaml | 54 +++++++++++++++++++ 6 files changed, 121 insertions(+) create mode 100644 charts/datadog-monitors/Chart.yaml create mode 120000 charts/datadog-monitors/ci/local-values.yaml create mode 100644 charts/datadog-monitors/templates/_helpers.tpl create mode 100644 charts/datadog-monitors/templates/datadog-monitors.yaml create mode 100644 charts/datadog-monitors/values.local.yaml create mode 100644 charts/datadog-monitors/values.yaml diff --git a/charts/datadog-monitors/Chart.yaml b/charts/datadog-monitors/Chart.yaml new file mode 100644 index 00000000..6d382fe9 --- /dev/null +++ b/charts/datadog-monitors/Chart.yaml @@ -0,0 +1,13 @@ +apiVersion: v2 +name: datadog-monitors +description: datadog monitor alerts template +type: application +version: 0.0.1 +appVersion: latest +maintainers: + - name: scohen + email: scohen@nextdoor.com +dependencies: + - name: nd-common + version: 0.3.6 + repository: file://../nd-common \ No newline at end of file diff --git a/charts/datadog-monitors/ci/local-values.yaml b/charts/datadog-monitors/ci/local-values.yaml new file mode 120000 index 00000000..123e79ec --- /dev/null +++ b/charts/datadog-monitors/ci/local-values.yaml @@ -0,0 +1 @@ +../values.local.yaml \ No newline at end of file diff --git a/charts/datadog-monitors/templates/_helpers.tpl b/charts/datadog-monitors/templates/_helpers.tpl new file mode 100644 index 00000000..018d80d1 --- /dev/null +++ b/charts/datadog-monitors/templates/_helpers.tpl @@ -0,0 +1,7 @@ +{{- define "datadog-monitors.shared-tags" -}} +- "service:{{ default $.Release.Name .Values.serviceName }}" +- "namespace:{{ .Release.Namespace }}" +{{- with .Values.team }} +- "team:{{ . }}" +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml new file mode 100644 index 00000000..b8a91ecf --- /dev/null +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -0,0 +1,21 @@ +{{- range $name, $monitor := .Values.monitors }} +--- +apiVersion: datadoghq.com/v1alpha1 +kind: DatadogMonitor +metadata: + name: {{ $name }} + labels: {{- include "nd-common.labels" $ | nindent 4 }} +spec: + name: {{ $monitor.name | quote }} + message: {{ $monitor.message | quote }} + query: {{ $monitor.query | quote }} + type: {{ default "query alert" $monitor.type | quote }} + {{- with $monitor.priority }} + priority: {{.}} + {{- end }} + tags: {{ include "datadog-monitors.shared-tags" $ | nindent 4 }} + {{- with $monitor.tags }}{{ toYaml . | nindent 4 }}{{- end }} + options: + locked: false + {{- with $monitor.options }}{{ toYaml . 
| nindent 4 }}{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml new file mode 100644 index 00000000..fd5be215 --- /dev/null +++ b/charts/datadog-monitors/values.local.yaml @@ -0,0 +1,25 @@ +serviceName: eks +team: cloudeng +monitors: + failed-pods: + name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" + message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." + priority: "2" + query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" + type: "query alert" + datadog-log-alert-test: + query: "logs(\"source:nagios AND status:error\").index(\"default\").rollup(\"count\").last(\"1h\") > 5" + type: "log alert" + name: "Test log alert made from DatadogMonitor" + message: "1-2-3 testing" + tags: + - "test:datadog" + priority: 5 + options: + enableLogsSample: true + evaluationDelay: 300 + includeTags: true + locked: false + notifyNoData: true + noDataTimeframe: 30 + renotifyInterval: 1440 \ No newline at end of file diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml new file mode 100644 index 00000000..dfff32e5 --- /dev/null +++ b/charts/datadog-monitors/values.yaml @@ -0,0 +1,54 @@ +# Default monitor values +serviceName: # Default pagerduty service name for the alerts, will turn to a tag for alerts - if not provided, the .Release.name will be used by default +team: # Default pagerduty team name for the alerts, will turn to a tag for alerts - if not provided, the tag will not be added + +# Placeholder for default datadog monitors +monitors: {} +# failed-pods: # Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) +# name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name +# message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." # Required monitor message +# priority: "2" # Optional alert severity/priority +# query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # Required alert query +# type: "query alert" # Optional, defaults to 'query alert'. 
+# // The type of monitor chosen from: +# // - anomaly: `query alert` +# // - APM: `query alert` or `trace-analytics alert` +# // - composite: `composite` +# // - custom: `service check` +# // - forecast: `query alert` +# // - host: `service check` +# // - integration: `query alert` or `service check` +# // - live process: `process alert` +# // - logs: `log alert` +# // - metric: `query alert` +# // - network: `service check` +# // - outlier: `query alert` +# // - process: `service check` +# // - rum: `rum alert` +# // - SLO: `slo alert` +# // - watchdog: `event-v2 alert` +# // - event-v2: `event-v2 alert` +# // - audit: `audit alert` +# // - error-tracking: `error-tracking alert` +# // - database-monitoring: `database-monitoring alert` +# // - network-performance: `network-performance alert` +# tags: # Optional list of tags (will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) +# - 'tagname:tagvalue' +# options: # Optional monitor parameters +# thresholds: # Optional alert thresholds +# critical: "1" # Optional critical threshold +# warning: "0.28" # Optional warning threshold, critical threshold will be required if warning is specified +# evaluationDelay: 300 # Time in seconds to wait before evaluating the monitor +# groupbySimpleMonitor: false # Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold +# includeTags: false # A Boolean indicating whether notifications from this monitor automatically inserts its triggering tags into the title. +# newGroupDelay: 300 # Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation +# notifyNoData: false # A Boolean indicating whether this monitor notifies when data stops reporting. +# noDataTimeframe: 30 # The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. +# renotifyInterval: 0 # The number of minutes after the last notification before a monitor re-notifies on the current status. +# renotifyOccurrences: 0 # The number of times re-notification messages should be sent on the current status at the provided re-notification interval. +# renotifyStatus: [] # The types of statuses for which re-notification messages should be sent. Valid values are alert, warn, no data. +# notifyBy: [] # List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. +# requireFullWindow: false # A Boolean indicating whether this monitor requires full window of data before it will fire. We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. +# thresholdWindows: # Threshold windows to finetune alerting +# recoveryWindow: "10m" # Describes how long an anomalous metric must be normal before the alert recovers. +# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. 
\ No newline at end of file From 5bdef9969cc12a27f7f5500e5636849057b8fdcc Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 15:03:45 -0500 Subject: [PATCH 02/11] Add new lines at the end of each file --- charts/datadog-monitors/Chart.yaml | 2 +- charts/datadog-monitors/templates/_helpers.tpl | 2 +- charts/datadog-monitors/templates/datadog-monitors.yaml | 2 +- charts/datadog-monitors/values.local.yaml | 2 +- charts/datadog-monitors/values.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charts/datadog-monitors/Chart.yaml b/charts/datadog-monitors/Chart.yaml index 6d382fe9..73597b77 100644 --- a/charts/datadog-monitors/Chart.yaml +++ b/charts/datadog-monitors/Chart.yaml @@ -10,4 +10,4 @@ maintainers: dependencies: - name: nd-common version: 0.3.6 - repository: file://../nd-common \ No newline at end of file + repository: file://../nd-common diff --git a/charts/datadog-monitors/templates/_helpers.tpl b/charts/datadog-monitors/templates/_helpers.tpl index 018d80d1..80b076b0 100644 --- a/charts/datadog-monitors/templates/_helpers.tpl +++ b/charts/datadog-monitors/templates/_helpers.tpl @@ -4,4 +4,4 @@ {{- with .Values.team }} - "team:{{ . }}" {{- end }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml index b8a91ecf..bd802d53 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -18,4 +18,4 @@ spec: options: locked: false {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index fd5be215..aefa08c2 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -22,4 +22,4 @@ monitors: locked: false notifyNoData: true noDataTimeframe: 30 - renotifyInterval: 1440 \ No newline at end of file + renotifyInterval: 1440 diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index dfff32e5..46d60536 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -51,4 +51,4 @@ monitors: {} # requireFullWindow: false # A Boolean indicating whether this monitor requires full window of data before it will fire. We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. # thresholdWindows: # Threshold windows to finetune alerting # recoveryWindow: "10m" # Describes how long an anomalous metric must be normal before the alert recovers. -# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. \ No newline at end of file +# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. 
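The values.local.yaml introduced above doubles as the chart's test override: ci/local-values.yaml is a symlink to it, matching the ci/*-values.yaml layout that chart-testing (ct) looks for. The chart can also be rendered by hand before anything is installed. A minimal sketch, assuming the commands run from the repository root and that the nd-common dependency really is available at ../nd-common as Chart.yaml declares:

    helm dependency update charts/datadog-monitors
    helm template datadog-monitors charts/datadog-monitors \
        -f charts/datadog-monitors/values.local.yaml

The dependency update step matters because nd-common comes from a local file:// repository rather than a published chart, so helm template fails with a missing-dependency error until it has been vendored into the chart's charts/ directory.
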
From 5d04770f27a8e169f2b2ae02c99f8e4f3f2edca3 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 15:20:54 -0500 Subject: [PATCH 03/11] Making sure to install the datadog monitor CRD only if it is available --- charts/datadog-monitors/templates/datadog-monitors.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml index bd802d53..f150e826 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -1,3 +1,4 @@ +{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}} {{- range $name, $monitor := .Values.monitors }} --- apiVersion: datadoghq.com/v1alpha1 @@ -19,3 +20,4 @@ spec: locked: false {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} {{- end }} +{{- end }} From a7e36a49efb34b43df38a002e59fc5f496c8b915 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 16:52:05 -0500 Subject: [PATCH 04/11] Updating docs --- charts/datadog-monitors/README.md | 54 +++++++ .../templates/datadog-monitors.yaml | 2 + charts/datadog-monitors/values.local.yaml | 1 + charts/datadog-monitors/values.yaml | 135 +++++++++++------- 4 files changed, 140 insertions(+), 52 deletions(-) create mode 100644 charts/datadog-monitors/README.md diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md new file mode 100644 index 00000000..baf12ca5 --- /dev/null +++ b/charts/datadog-monitors/README.md @@ -0,0 +1,54 @@ +# datadog-monitors + +![Version: 0.0.1](https://img.shields.io/badge/Version-0.0.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: latest](https://img.shields.io/badge/AppVersion-latest-informational?style=flat-square) + +datadog monitor alerts template + +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| scohen | | | + +## Requirements + +| Repository | Name | Version | +|------------|------|---------| +| file://../nd-common | nd-common | 0.3.6 | + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}}` | List of monitors | +| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. 
Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) @section -- monitor | +| monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False | +| monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."` | Required: monitor message | +| monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name | +| monitors.resourceName.options | `map[string]interface{}` | `{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}}` | Optional: monitor options | +| monitors.resourceName.options.evaluationDelay | `string` | `300` | Optional: Time in seconds to wait before evaluating the monitor | +| monitors.resourceName.options.groupbySimpleMonitor | `boolean` | `false` | Optional: A Boolean indicating Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold. | +| monitors.resourceName.options.includeTags | `boolean` | `false` | Optional: A Boolean indicating whether notifications from this monitor automatically insert its triggering tags into the title. | +| monitors.resourceName.options.newGroupDelay | `string` | `300` | Optional: Time in seconds to allow a host to boot and applications to fully start before starting the evaluation. | +| monitors.resourceName.options.noDataTimeframe | `int` | `30` | Optional: The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. | +| monitors.resourceName.options.notifyBy | `string[]` | `[]` | Optional: List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. | +| monitors.resourceName.options.notifyNoData | `boolean` | `false` | Optional: A Boolean indicating whether this monitor notifies when data stops reporting. | +| monitors.resourceName.options.renotifyInterval | `int` | `0` | Optional: The number of minutes after the last notification before a monitor re-notifies on the current status. 
| +| monitors.resourceName.options.renotifyOccurrences | `string[]` | `0` | Optional: The number of times re-notification messages should be sent on the current status at the provided re-notification interval. | +| monitors.resourceName.options.renotifyStatus | `string[]` | `[]` | Optional: The types of statuses for which re-notification messages should be sent(Valid values are alert, warn, no data). | +| monitors.resourceName.options.requireFullWindow | `boolean` | `false` | Optional: A Boolean indicating whether this monitor requires full window of data before it will fire, We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. | +| monitors.resourceName.options.thresholdWindows | `map[string]string` | `{"alertWindow":"5m","recoveryWindow":"10m"}` | Optional: Threshold windows to finetune alerting | +| monitors.resourceName.options.thresholdWindows.alertWindow | `string` | `"5m"` | Optional: Describes how long an anomalous metric must be anomalous before the alert fires. | +| monitors.resourceName.options.thresholdWindows.recoveryWindow | `string` | `"10m"` | Optional: Describes how long an anomalous metric must be normal before the alert recovers. | +| monitors.resourceName.options.thresholds | `map[string]string` | `{"critical":"1","warning":"0.28"}` | Optional: monitor thresholds | +| monitors.resourceName.options.thresholds.critical | `string` | `"1"` | Optional: monitor critical threshold | +| monitors.resourceName.options.thresholds.warning | `string` | `"0.28"` | Optional: monitor warning threshold | +| monitors.resourceName.priority | `string` | `"2"` | Optional: monitor piority | +| monitors.resourceName.query | `string` | `"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"` | Required: monitor query | +| monitors.resourceName.tags | `string[]` | `["tagname:tagvalue"]` | Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) | +| monitors.resourceName.type | `string` | `"query alert"` | Optional: monitor type, if not specified will default to 'query alert' Datadog monitor types to type values mapping: - anomaly: `query alert` - APM: `query alert` or `trace-analytics alert` - composite: `composite` - custom: `service check` - forecast: `query alert` - host: `service check` - integration: `query alert` or `service check` - live process: `process alert` - logs: `log alert` - metric: `query alert` - network: `service check` - outlier: `query alert` - process: `service check` - rum: `rum alert` - SLO: `slo alert` - watchdog: `event-v2 alert` - event-v2: `event-v2 alert` - audit: `audit alert` - error-tracking: `error-tracking alert` - database-monitoring: `database-monitoring alert` - network-performance: `network-performance alert` - service-discovery: `service-discovery alert` | +| serviceName | `string` | `nil` | Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default | +| team | `string` | `nil` | Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml 
b/charts/datadog-monitors/templates/datadog-monitors.yaml index f150e826..43bc61f7 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -1,5 +1,6 @@ {{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}} {{- range $name, $monitor := .Values.monitors }} +{{- if not $monitor.disabled }} --- apiVersion: datadoghq.com/v1alpha1 kind: DatadogMonitor @@ -21,3 +22,4 @@ spec: {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} {{- end }} {{- end }} +{{- end }} diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index aefa08c2..7034d661 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -2,6 +2,7 @@ serviceName: eks team: cloudeng monitors: failed-pods: + disabled: True name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." priority: "2" diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index 46d60536..f69db66e 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -1,54 +1,85 @@ # Default monitor values -serviceName: # Default pagerduty service name for the alerts, will turn to a tag for alerts - if not provided, the .Release.name will be used by default -team: # Default pagerduty team name for the alerts, will turn to a tag for alerts - if not provided, the tag will not be added +# -- (`string`) Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default +serviceName: +# -- (`string`) Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added +team: -# Placeholder for default datadog monitors -monitors: {} -# failed-pods: # Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) -# name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name -# message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." # Required monitor message -# priority: "2" # Optional alert severity/priority -# query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # Required alert query -# type: "query alert" # Optional, defaults to 'query alert'. 
-# // The type of monitor chosen from: -# // - anomaly: `query alert` -# // - APM: `query alert` or `trace-analytics alert` -# // - composite: `composite` -# // - custom: `service check` -# // - forecast: `query alert` -# // - host: `service check` -# // - integration: `query alert` or `service check` -# // - live process: `process alert` -# // - logs: `log alert` -# // - metric: `query alert` -# // - network: `service check` -# // - outlier: `query alert` -# // - process: `service check` -# // - rum: `rum alert` -# // - SLO: `slo alert` -# // - watchdog: `event-v2 alert` -# // - event-v2: `event-v2 alert` -# // - audit: `audit alert` -# // - error-tracking: `error-tracking alert` -# // - database-monitoring: `database-monitoring alert` -# // - network-performance: `network-performance alert` -# tags: # Optional list of tags (will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) -# - 'tagname:tagvalue' -# options: # Optional monitor parameters -# thresholds: # Optional alert thresholds -# critical: "1" # Optional critical threshold -# warning: "0.28" # Optional warning threshold, critical threshold will be required if warning is specified -# evaluationDelay: 300 # Time in seconds to wait before evaluating the monitor -# groupbySimpleMonitor: false # Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold -# includeTags: false # A Boolean indicating whether notifications from this monitor automatically inserts its triggering tags into the title. -# newGroupDelay: 300 # Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation -# notifyNoData: false # A Boolean indicating whether this monitor notifies when data stops reporting. -# noDataTimeframe: 30 # The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. -# renotifyInterval: 0 # The number of minutes after the last notification before a monitor re-notifies on the current status. -# renotifyOccurrences: 0 # The number of times re-notification messages should be sent on the current status at the provided re-notification interval. -# renotifyStatus: [] # The types of statuses for which re-notification messages should be sent. Valid values are alert, warn, no data. -# notifyBy: [] # List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. -# requireFullWindow: false # A Boolean indicating whether this monitor requires full window of data before it will fire. We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. -# thresholdWindows: # Threshold windows to finetune alerting -# recoveryWindow: "10m" # Describes how long an anomalous metric must be normal before the alert recovers. -# alertWindow: "5m" # Describes how long an anomalous metric must be anomalous before the alert fires. 
+# -- (`map[string]interface{}`) List of monitors +monitors: + # -- (`map[string]interface{}`) Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) + # @section -- monitor + resourceName: + # -- (`boolean`) Optional: whether to exclude the monitor, defaults to False + disabled: True + # -- (`string`) Require: monitor name + name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name + # -- (`string`) Required: monitor message + message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." + # -- (`string`) Optional: monitor piority + priority: "2" # Optional alert severity/priority + # -- (`string`) Required: monitor query + query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" + # -- (`string`) Optional: monitor type, if not specified will default to 'query alert' + # Datadog monitor types to type values mapping: + # - anomaly: `query alert` + # - APM: `query alert` or `trace-analytics alert` + # - composite: `composite` + # - custom: `service check` + # - forecast: `query alert` + # - host: `service check` + # - integration: `query alert` or `service check` + # - live process: `process alert` + # - logs: `log alert` + # - metric: `query alert` + # - network: `service check` + # - outlier: `query alert` + # - process: `service check` + # - rum: `rum alert` + # - SLO: `slo alert` + # - watchdog: `event-v2 alert` + # - event-v2: `event-v2 alert` + # - audit: `audit alert` + # - error-tracking: `error-tracking alert` + # - database-monitoring: `database-monitoring alert` + # - network-performance: `network-performance alert` + # - service-discovery: `service-discovery alert` + type: "query alert" + # -- (`string[]`) Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) + tags: + - 'tagname:tagvalue' + # -- (`map[string]interface{}`) Optional: monitor options + options: + # -- (`map[string]string`) Optional: monitor thresholds + thresholds: + # -- (`string`) Optional: monitor critical threshold + critical: "1" + # -- (`string`) Optional: monitor warning threshold + warning: "0.28" + # -- (`string`) Optional: Time in seconds to wait before evaluating the monitor + evaluationDelay: 300 + # -- (`boolean`) Optional: A Boolean indicating Whether or not to group by simple monitor, triggers a single alert or multiple alerts when any group breaches the threshold. + groupbySimpleMonitor: false + # -- (`boolean`) Optional: A Boolean indicating whether notifications from this monitor automatically insert its triggering tags into the title. + includeTags: False + # -- (`string`) Optional: Time in seconds to allow a host to boot and applications to fully start before starting the evaluation. + newGroupDelay: 300 + # -- (`boolean`) Optional: A Boolean indicating whether this monitor notifies when data stops reporting. + notifyNoData: False + # -- (`int`) Optional: The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe is used for metric alerts, and 24 hours is used for service checks. 
+ noDataTimeframe: 30 + # -- (`int`) Optional: The number of minutes after the last notification before a monitor re-notifies on the current status. + renotifyInterval: 0 + # -- (`string[]`) Optional: The number of times re-notification messages should be sent on the current status at the provided re-notification interval. + renotifyOccurrences: 0 + # -- (`string[]`) Optional: The types of statuses for which re-notification messages should be sent(Valid values are alert, warn, no data). + renotifyStatus: [] + # -- (`string[]`) Optional: List of labels indicating the granularity for a monitor to alert on. Only available for monitors with groupings. + notifyBy: [] + # -- (`boolean`) Optional: A Boolean indicating whether this monitor requires full window of data before it will fire, We highly recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. + requireFullWindow: false + # -- (`map[string]string`) Optional: Threshold windows to finetune alerting + thresholdWindows: + # -- (`string`) Optional: Describes how long an anomalous metric must be normal before the alert recovers. + recoveryWindow: "10m" + # -- (`string`) Optional: Describes how long an anomalous metric must be anomalous before the alert fires. + alertWindow: "5m" From edc1ad7f3919e7c191e815740d3f0e1394791cb9 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 16:55:30 -0500 Subject: [PATCH 05/11] Updating docs --- charts/datadog-monitors/values.local.yaml | 2 +- charts/datadog-monitors/values.yaml | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index 7034d661..290b5b9c 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -2,7 +2,7 @@ serviceName: eks team: cloudeng monitors: failed-pods: - disabled: True + disabled: true name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." priority: "2" diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index f69db66e..8753e367 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -1,4 +1,3 @@ -# Default monitor values # -- (`string`) Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default serviceName: # -- (`string`) Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added @@ -7,16 +6,15 @@ team: # -- (`map[string]interface{}`) List of monitors monitors: # -- (`map[string]interface{}`) Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) - # @section -- monitor resourceName: # -- (`boolean`) Optional: whether to exclude the monitor, defaults to False - disabled: True + disabled: true # -- (`string`) Require: monitor name - name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # Required monitor name + name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # -- (`string`) Required: monitor message message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). 
\n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs." # -- (`string`) Optional: monitor piority - priority: "2" # Optional alert severity/priority + priority: "2"\ # -- (`string`) Required: monitor query query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # -- (`string`) Optional: monitor type, if not specified will default to 'query alert' From 1b7c791bbeb72fd43c9c613c83625283c55c7753 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:09:03 -0500 Subject: [PATCH 06/11] Test with crd --- .../crds/datadoghq.com_datadogmonitors.yaml | 378 ++++++++++++++++++ charts/datadog-monitors/values.local.yaml | 1 + 2 files changed, 379 insertions(+) create mode 100644 charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml diff --git a/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml b/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml new file mode 100644 index 00000000..60b95865 --- /dev/null +++ b/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml @@ -0,0 +1,378 @@ +{{- if .Values.testInstallCrd }} +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + creationTimestamp: null + name: datadogmonitors.datadoghq.com +spec: + group: datadoghq.com + names: + kind: DatadogMonitor + listKind: DatadogMonitorList + plural: datadogmonitors + singular: datadogmonitor + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.id + name: id + type: string + - jsonPath: .status.monitorState + name: monitor state + type: string + - jsonPath: .status.monitorStateLastTransitionTime + name: last state transition + type: string + - format: date + jsonPath: .status.monitorStateLastUpdateTime + name: last state sync + type: string + - jsonPath: .status.syncStatus + name: sync status + type: string + - jsonPath: .metadata.creationTimestamp + name: age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: DatadogMonitor allows to define and manage Monitors from your + Kubernetes Cluster + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DatadogMonitorSpec defines the desired state of DatadogMonitor + properties: + controllerOptions: + description: ControllerOptions are the optional parameters in the + DatadogMonitor controller + properties: + disableRequiredTags: + description: DisableRequiredTags disables the automatic addition + of required tags to monitors. 
+ type: boolean + type: object + message: + description: Message is a message to include with notifications for + this monitor + type: string + name: + description: Name is the monitor name + type: string + options: + description: Options are the optional parameters associated with your + monitor + properties: + enableLogsSample: + description: A Boolean indicating whether to send a log sample + when the log monitor triggers. + type: boolean + escalationMessage: + description: A message to include with a re-notification. + type: string + evaluationDelay: + description: |- + Time (in seconds) to delay evaluation, as a non-negative integer. For example, if the value is set to 300 (5min), + the timeframe is set to last_5m and the time is 7:00, the monitor evaluates data from 6:50 to 6:55. + This is useful for AWS CloudWatch and other backfilled metrics to ensure the monitor always has data during evaluation. + format: int64 + type: integer + groupbySimpleMonitor: + description: A Boolean indicating whether the log alert monitor + triggers a single alert or multiple alerts when any group breaches + a threshold. + type: boolean + includeTags: + description: A Boolean indicating whether notifications from this + monitor automatically inserts its triggering tags into the title. + type: boolean + locked: + description: 'DEPRECATED: Whether or not the monitor is locked + (only editable by creator and admins). Use `restricted_roles` + instead.' + type: boolean + newGroupDelay: + description: |- + Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation of + monitor results. Should be a non negative integer. + format: int64 + type: integer + noDataTimeframe: + description: |- + The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the + monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe + is used for metric alerts, and 24 hours is used for service checks. + format: int64 + type: integer + notificationPresetName: + description: An enum that toggles the display of additional content + sent in the monitor notification. + type: string + notifyAudit: + description: A Boolean indicating whether tagged users are notified + on changes to this monitor. + type: boolean + notifyBy: + description: |- + A string indicating the granularity a monitor alerts on. Only available for monitors with groupings. + For instance, a monitor grouped by cluster, namespace, and pod can be configured to only notify on each new + cluster violating the alert conditions by setting notify_by to ["cluster"]. Tags mentioned in notify_by must + be a subset of the grouping tags in the query. For example, a query grouped by cluster and namespace cannot + notify on region. Setting notify_by to [*] configures the monitor to notify as a simple-alert. + items: + type: string + type: array + x-kubernetes-list-type: set + notifyNoData: + description: A Boolean indicating whether this monitor notifies + when data stops reporting. + type: boolean + onMissingData: + description: |- + An enum that controls how groups or monitors are treated if an evaluation does not return data points. + The default option results in different behavior depending on the monitor query type. + For monitors using Count queries, an empty monitor evaluation is treated as 0 and is compared to the threshold conditions. 
+ For monitors using any query type other than Count, for example Gauge, Measure, or Rate, the monitor shows the last known status. + This option is only available for APM Trace Analytics, Audit Trail, CI, Error Tracking, Event, Logs, and RUM monitors + type: string + renotifyInterval: + description: |- + The number of minutes after the last notification before a monitor re-notifies on the current status. + It only re-notifies if it’s not resolved. + format: int64 + type: integer + renotifyOccurrences: + description: The number of times re-notification messages should + be sent on the current status at the provided re-notification + interval. + format: int64 + type: integer + renotifyStatuses: + description: The types of statuses for which re-notification messages + should be sent. Valid values are alert, warn, no data. + items: + description: MonitorRenotifyStatusType The different statuses + for which renotification is supported. + type: string + type: array + x-kubernetes-list-type: set + requireFullWindow: + description: |- + A Boolean indicating whether this monitor needs a full window of data before it’s evaluated. We highly + recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. Default is false. + type: boolean + thresholdWindows: + description: A struct of the alerting time window options. + properties: + recoveryWindow: + description: Describes how long an anomalous metric must be + normal before the alert recovers. + type: string + triggerWindow: + description: Describes how long a metric must be anomalous + before an alert triggers. + type: string + type: object + thresholds: + description: A struct of the different monitor threshold values. + properties: + critical: + description: The monitor CRITICAL threshold. + type: string + criticalRecovery: + description: The monitor CRITICAL recovery threshold. + type: string + ok: + description: The monitor OK threshold. + type: string + unknown: + description: The monitor UNKNOWN threshold. + type: string + warning: + description: The monitor WARNING threshold. + type: string + warningRecovery: + description: The monitor WARNING recovery threshold. + type: string + type: object + timeoutH: + description: The number of hours of the monitor not reporting + data before it automatically resolves from a triggered state. + format: int64 + type: integer + type: object + priority: + description: Priority is an integer from 1 (high) to 5 (low) indicating + alert severity + format: int64 + type: integer + query: + description: Query is the Datadog monitor query + type: string + restrictedRoles: + description: |- + RestrictedRoles is a list of unique role identifiers to define which roles are allowed to edit the monitor. + `restricted_roles` is the successor of `locked`. For more information about `locked` and `restricted_roles`, + see the [monitor options docs](https://docs.datadoghq.com/monitors/guide/monitor_api_options/#permissions-options). + items: + type: string + type: array + x-kubernetes-list-type: set + tags: + description: Tags is the monitor tags associated with your monitor + items: + type: string + type: array + x-kubernetes-list-type: set + type: + description: Type is the monitor type + type: string + type: object + status: + description: DatadogMonitorStatus defines the observed state of DatadogMonitor + properties: + conditions: + description: Conditions Represents the latest available observations + of a DatadogMonitor's current state. 
+ items: + description: DatadogMonitorCondition describes the current state + of a DatadogMonitor + properties: + lastTransitionTime: + description: Last time the condition transitioned from one status + to another. + format: date-time + type: string + lastUpdateTime: + description: Last time the condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details about + the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of DatadogMonitor condition + type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + created: + description: Created is the time the monitor was created + format: date-time + type: string + creator: + description: Creator is the identify of the monitor creator + type: string + currentHash: + description: |- + CurrentHash tracks the hash of the current DatadogMonitorSpec to know + if the Spec has changed and needs an update + type: string + downtimeStatus: + description: DowntimeStatus defines whether the monitor is downtimed + properties: + downtimeID: + description: DowntimeID is the downtime ID. + type: integer + isDowntimed: + description: IsDowntimed shows the downtime status of the monitor. + type: boolean + type: object + id: + description: ID is the monitor ID generated in Datadog + type: integer + monitorLastForceSyncTime: + description: MonitorLastForceSyncTime is the last time the API monitor + was last force synced with the DatadogMonitor resource + format: date-time + type: string + monitorState: + description: MonitorState is the overall state of monitor + type: string + monitorStateLastTransitionTime: + description: MonitorStateLastTransitionTime is the last time the monitor + state changed + format: date-time + type: string + monitorStateLastUpdateTime: + description: MonitorStateLastUpdateTime is the last time the monitor + state updated + format: date-time + type: string + monitorStateSyncStatus: + description: MonitorStateSyncStatus shows the health of syncing the + monitor state to Datadog + type: string + primary: + description: |- + Primary defines whether the monitor is managed by the Kubernetes custom + resource (true) or outside Kubernetes (false) + type: boolean + triggeredState: + description: TriggeredState only includes details for monitor groups + that are triggering + items: + description: |- + DatadogMonitorTriggeredState represents the details of a triggering DatadogMonitor + The DatadogMonitor is triggering if one of its groups is in Alert, Warn, or No Data + properties: + lastTransitionTime: + format: date-time + type: string + monitorGroup: + description: MonitorGroup is the name of the triggering group + type: string + state: + description: DatadogMonitorState represents the overall DatadogMonitor + state + type: string + required: + - monitorGroup + type: object + type: array + x-kubernetes-list-map-keys: + - monitorGroup + x-kubernetes-list-type: map + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null +{{- end }} \ No newline at end of file diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index 290b5b9c..cd5bd506 100644 --- 
a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -1,3 +1,4 @@ +testInstallCrd: true serviceName: eks team: cloudeng monitors: From 7c55e1c675224126c107dc9eaf7fcb061a68b8e2 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:11:04 -0500 Subject: [PATCH 07/11] Test with crd --- charts/datadog-monitors/templates/datadog-monitors.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml index 43bc61f7..adb117a5 100644 --- a/charts/datadog-monitors/templates/datadog-monitors.yaml +++ b/charts/datadog-monitors/templates/datadog-monitors.yaml @@ -1,4 +1,4 @@ -{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}} +{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}} {{- range $name, $monitor := .Values.monitors }} {{- if not $monitor.disabled }} --- @@ -22,4 +22,4 @@ spec: {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }} {{- end }} {{- end }} -{{- end }} +{{/*- end */}} From d994925e7e5065588fb9f55284243c47091ba5a2 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:24:45 -0500 Subject: [PATCH 08/11] Update --- .../crds/datadoghq.com_datadogmonitors.yaml | 378 ------------------ .../templates/datadog-monitors.yaml | 4 +- charts/datadog-monitors/values.yaml | 2 +- 3 files changed, 3 insertions(+), 381 deletions(-) delete mode 100644 charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml diff --git a/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml b/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml deleted file mode 100644 index 60b95865..00000000 --- a/charts/datadog-monitors/crds/datadoghq.com_datadogmonitors.yaml +++ /dev/null @@ -1,378 +0,0 @@ -{{- if .Values.testInstallCrd }} -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.14.0 - creationTimestamp: null - name: datadogmonitors.datadoghq.com -spec: - group: datadoghq.com - names: - kind: DatadogMonitor - listKind: DatadogMonitorList - plural: datadogmonitors - singular: datadogmonitor - scope: Namespaced - versions: - - additionalPrinterColumns: - - jsonPath: .status.id - name: id - type: string - - jsonPath: .status.monitorState - name: monitor state - type: string - - jsonPath: .status.monitorStateLastTransitionTime - name: last state transition - type: string - - format: date - jsonPath: .status.monitorStateLastUpdateTime - name: last state sync - type: string - - jsonPath: .status.syncStatus - name: sync status - type: string - - jsonPath: .metadata.creationTimestamp - name: age - type: date - name: v1alpha1 - schema: - openAPIV3Schema: - description: DatadogMonitor allows to define and manage Monitors from your - Kubernetes Cluster - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: DatadogMonitorSpec defines the desired state of DatadogMonitor - properties: - controllerOptions: - description: ControllerOptions are the optional parameters in the - DatadogMonitor controller - properties: - disableRequiredTags: - description: DisableRequiredTags disables the automatic addition - of required tags to monitors. - type: boolean - type: object - message: - description: Message is a message to include with notifications for - this monitor - type: string - name: - description: Name is the monitor name - type: string - options: - description: Options are the optional parameters associated with your - monitor - properties: - enableLogsSample: - description: A Boolean indicating whether to send a log sample - when the log monitor triggers. - type: boolean - escalationMessage: - description: A message to include with a re-notification. - type: string - evaluationDelay: - description: |- - Time (in seconds) to delay evaluation, as a non-negative integer. For example, if the value is set to 300 (5min), - the timeframe is set to last_5m and the time is 7:00, the monitor evaluates data from 6:50 to 6:55. - This is useful for AWS CloudWatch and other backfilled metrics to ensure the monitor always has data during evaluation. - format: int64 - type: integer - groupbySimpleMonitor: - description: A Boolean indicating whether the log alert monitor - triggers a single alert or multiple alerts when any group breaches - a threshold. - type: boolean - includeTags: - description: A Boolean indicating whether notifications from this - monitor automatically inserts its triggering tags into the title. - type: boolean - locked: - description: 'DEPRECATED: Whether or not the monitor is locked - (only editable by creator and admins). Use `restricted_roles` - instead.' - type: boolean - newGroupDelay: - description: |- - Time (in seconds) to allow a host to boot and applications to fully start before starting the evaluation of - monitor results. Should be a non negative integer. - format: int64 - type: integer - noDataTimeframe: - description: |- - The number of minutes before a monitor notifies after data stops reporting. Datadog recommends at least 2x the - monitor timeframe for metric alerts or 2 minutes for service checks. If omitted, 2x the evaluation timeframe - is used for metric alerts, and 24 hours is used for service checks. - format: int64 - type: integer - notificationPresetName: - description: An enum that toggles the display of additional content - sent in the monitor notification. - type: string - notifyAudit: - description: A Boolean indicating whether tagged users are notified - on changes to this monitor. - type: boolean - notifyBy: - description: |- - A string indicating the granularity a monitor alerts on. Only available for monitors with groupings. - For instance, a monitor grouped by cluster, namespace, and pod can be configured to only notify on each new - cluster violating the alert conditions by setting notify_by to ["cluster"]. Tags mentioned in notify_by must - be a subset of the grouping tags in the query. For example, a query grouped by cluster and namespace cannot - notify on region. Setting notify_by to [*] configures the monitor to notify as a simple-alert. 
- items: - type: string - type: array - x-kubernetes-list-type: set - notifyNoData: - description: A Boolean indicating whether this monitor notifies - when data stops reporting. - type: boolean - onMissingData: - description: |- - An enum that controls how groups or monitors are treated if an evaluation does not return data points. - The default option results in different behavior depending on the monitor query type. - For monitors using Count queries, an empty monitor evaluation is treated as 0 and is compared to the threshold conditions. - For monitors using any query type other than Count, for example Gauge, Measure, or Rate, the monitor shows the last known status. - This option is only available for APM Trace Analytics, Audit Trail, CI, Error Tracking, Event, Logs, and RUM monitors - type: string - renotifyInterval: - description: |- - The number of minutes after the last notification before a monitor re-notifies on the current status. - It only re-notifies if it’s not resolved. - format: int64 - type: integer - renotifyOccurrences: - description: The number of times re-notification messages should - be sent on the current status at the provided re-notification - interval. - format: int64 - type: integer - renotifyStatuses: - description: The types of statuses for which re-notification messages - should be sent. Valid values are alert, warn, no data. - items: - description: MonitorRenotifyStatusType The different statuses - for which renotification is supported. - type: string - type: array - x-kubernetes-list-type: set - requireFullWindow: - description: |- - A Boolean indicating whether this monitor needs a full window of data before it’s evaluated. We highly - recommend you set this to false for sparse metrics, otherwise some evaluations are skipped. Default is false. - type: boolean - thresholdWindows: - description: A struct of the alerting time window options. - properties: - recoveryWindow: - description: Describes how long an anomalous metric must be - normal before the alert recovers. - type: string - triggerWindow: - description: Describes how long a metric must be anomalous - before an alert triggers. - type: string - type: object - thresholds: - description: A struct of the different monitor threshold values. - properties: - critical: - description: The monitor CRITICAL threshold. - type: string - criticalRecovery: - description: The monitor CRITICAL recovery threshold. - type: string - ok: - description: The monitor OK threshold. - type: string - unknown: - description: The monitor UNKNOWN threshold. - type: string - warning: - description: The monitor WARNING threshold. - type: string - warningRecovery: - description: The monitor WARNING recovery threshold. - type: string - type: object - timeoutH: - description: The number of hours of the monitor not reporting - data before it automatically resolves from a triggered state. - format: int64 - type: integer - type: object - priority: - description: Priority is an integer from 1 (high) to 5 (low) indicating - alert severity - format: int64 - type: integer - query: - description: Query is the Datadog monitor query - type: string - restrictedRoles: - description: |- - RestrictedRoles is a list of unique role identifiers to define which roles are allowed to edit the monitor. - `restricted_roles` is the successor of `locked`. For more information about `locked` and `restricted_roles`, - see the [monitor options docs](https://docs.datadoghq.com/monitors/guide/monitor_api_options/#permissions-options). 
- items: - type: string - type: array - x-kubernetes-list-type: set - tags: - description: Tags is the monitor tags associated with your monitor - items: - type: string - type: array - x-kubernetes-list-type: set - type: - description: Type is the monitor type - type: string - type: object - status: - description: DatadogMonitorStatus defines the observed state of DatadogMonitor - properties: - conditions: - description: Conditions Represents the latest available observations - of a DatadogMonitor's current state. - items: - description: DatadogMonitorCondition describes the current state - of a DatadogMonitor - properties: - lastTransitionTime: - description: Last time the condition transitioned from one status - to another. - format: date-time - type: string - lastUpdateTime: - description: Last time the condition was updated. - format: date-time - type: string - message: - description: A human readable message indicating details about - the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of DatadogMonitor condition - type: string - required: - - status - - type - type: object - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - created: - description: Created is the time the monitor was created - format: date-time - type: string - creator: - description: Creator is the identify of the monitor creator - type: string - currentHash: - description: |- - CurrentHash tracks the hash of the current DatadogMonitorSpec to know - if the Spec has changed and needs an update - type: string - downtimeStatus: - description: DowntimeStatus defines whether the monitor is downtimed - properties: - downtimeID: - description: DowntimeID is the downtime ID. - type: integer - isDowntimed: - description: IsDowntimed shows the downtime status of the monitor. 
-                    type: boolean
-                type: object
-              id:
-                description: ID is the monitor ID generated in Datadog
-                type: integer
-              monitorLastForceSyncTime:
-                description: MonitorLastForceSyncTime is the last time the API monitor
-                  was last force synced with the DatadogMonitor resource
-                format: date-time
-                type: string
-              monitorState:
-                description: MonitorState is the overall state of monitor
-                type: string
-              monitorStateLastTransitionTime:
-                description: MonitorStateLastTransitionTime is the last time the monitor
-                  state changed
-                format: date-time
-                type: string
-              monitorStateLastUpdateTime:
-                description: MonitorStateLastUpdateTime is the last time the monitor
-                  state updated
-                format: date-time
-                type: string
-              monitorStateSyncStatus:
-                description: MonitorStateSyncStatus shows the health of syncing the
-                  monitor state to Datadog
-                type: string
-              primary:
-                description: |-
-                  Primary defines whether the monitor is managed by the Kubernetes custom
-                  resource (true) or outside Kubernetes (false)
-                type: boolean
-              triggeredState:
-                description: TriggeredState only includes details for monitor groups
-                  that are triggering
-                items:
-                  description: |-
-                    DatadogMonitorTriggeredState represents the details of a triggering DatadogMonitor
-                    The DatadogMonitor is triggering if one of its groups is in Alert, Warn, or No Data
-                  properties:
-                    lastTransitionTime:
-                      format: date-time
-                      type: string
-                    monitorGroup:
-                      description: MonitorGroup is the name of the triggering group
-                      type: string
-                    state:
-                      description: DatadogMonitorState represents the overall DatadogMonitor
-                        state
-                      type: string
-                  required:
-                  - monitorGroup
-                  type: object
-                type: array
-                x-kubernetes-list-map-keys:
-                - monitorGroup
-                x-kubernetes-list-type: map
-            type: object
-        type: object
-    served: true
-    storage: true
-    subresources:
-      status: {}
-status:
-  acceptedNames:
-    kind: ""
-    plural: ""
-  conditions: null
-  storedVersions: null
-{{- end }}
\ No newline at end of file
diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml
index adb117a5..43bc61f7 100644
--- a/charts/datadog-monitors/templates/datadog-monitors.yaml
+++ b/charts/datadog-monitors/templates/datadog-monitors.yaml
@@ -1,4 +1,4 @@
-{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}}
+{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}}
 {{- range $name, $monitor := .Values.monitors }}
 {{- if not $monitor.disabled }}
 ---
@@ -22,4 +22,4 @@ spec:
   {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }}
 {{- end }}
 {{- end }}
-{{/*- end */}}
+{{- end }}
diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml
index 8753e367..97e12980 100644
--- a/charts/datadog-monitors/values.yaml
+++ b/charts/datadog-monitors/values.yaml
@@ -14,7 +14,7 @@ monitors:
     # -- (`string`) Required: monitor message
     message: "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."
# -- (`string`) Optional: monitor piority - priority: "2"\ + priority: "2" # -- (`string`) Required: monitor query query: "change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10" # -- (`string`) Optional: monitor type, if not specified will default to 'query alert' From 29228bb82e0d4b3ed5c63863b3a45e7d1958b7aa Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Tue, 5 Nov 2024 17:26:24 -0500 Subject: [PATCH 09/11] Update docs --- charts/datadog-monitors/README.md | 2 +- charts/datadog-monitors/values.local.yaml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md index baf12ca5..d816e755 100644 --- a/charts/datadog-monitors/README.md +++ b/charts/datadog-monitors/README.md @@ -21,7 +21,7 @@ datadog monitor alerts template | Key | Type | Default | Description | |-----|------|---------|-------------| | monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}}` | List of monitors | -| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) @section -- monitor | +| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. 
Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) | | monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False | | monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."` | Required: monitor message | | monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name | diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml index cd5bd506..290b5b9c 100644 --- a/charts/datadog-monitors/values.local.yaml +++ b/charts/datadog-monitors/values.local.yaml @@ -1,4 +1,3 @@ -testInstallCrd: true serviceName: eks team: cloudeng monitors: From 1ea289844a3703be08a6adc151f26f188d81bc03 Mon Sep 17 00:00:00 2001 From: Shalom Cohen Date: Wed, 6 Nov 2024 11:42:03 -0500 Subject: [PATCH 10/11] Update --- charts/datadog-monitors/README.md | 6 +++--- .../datadog-monitors/templates/_helpers.tpl | 19 ++++++++++++------- .../templates/datadog-monitors.yaml | 7 +++---- charts/datadog-monitors/values.local.yaml | 3 ++- charts/datadog-monitors/values.yaml | 4 ++-- 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md index d816e755..ee802aaa 100644 --- a/charts/datadog-monitors/README.md +++ b/charts/datadog-monitors/README.md @@ -20,8 +20,8 @@ datadog monitor alerts template | Key | Type | Default | Description | |-----|------|---------|-------------| -| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}}` | List of monitors | -| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). 
\n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":["tagname:tagvalue"],"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) | +| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}}` | List of monitors | +| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) | | monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False | | monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. 
Change the threshold to suit your needs."` | Required: monitor message |
 | monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name |
@@ -45,7 +45,7 @@ datadog monitor alerts template
 | monitors.resourceName.options.thresholds.warning | `string` | `"0.28"` | Optional: monitor warning threshold |
 | monitors.resourceName.priority | `string` | `"2"` | Optional: monitor piority |
 | monitors.resourceName.query | `string` | `"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10"` | Required: monitor query |
-| monitors.resourceName.tags | `string[]` | `["tagname:tagvalue"]` | Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group) |
+| monitors.resourceName.tags | `map[string]string` | `{"tagname":"tagvalue"}` | Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace) |
 | monitors.resourceName.type | `string` | `"query alert"` | Optional: monitor type, if not specified will default to 'query alert' Datadog monitor types to type values mapping: - anomaly: `query alert` - APM: `query alert` or `trace-analytics alert` - composite: `composite` - custom: `service check` - forecast: `query alert` - host: `service check` - integration: `query alert` or `service check` - live process: `process alert` - logs: `log alert` - metric: `query alert` - network: `service check` - outlier: `query alert` - process: `service check` - rum: `rum alert` - SLO: `slo alert` - watchdog: `event-v2 alert` - event-v2: `event-v2 alert` - audit: `audit alert` - error-tracking: `error-tracking alert` - database-monitoring: `database-monitoring alert` - network-performance: `network-performance alert` - service-discovery: `service-discovery alert` |
 | serviceName | `string` | `nil` | Optional shared pagerduty service name for monitors, will turn to a tag for alerts - if not provided, the .Release.name will be used by default |
 | team | `string` | `nil` | Optional shared pagerduty team name for monitors, will turn to a tag for alerts - if not provided, the tag will not be added |
diff --git a/charts/datadog-monitors/templates/_helpers.tpl b/charts/datadog-monitors/templates/_helpers.tpl
index 80b076b0..cfe2f7fe 100644
--- a/charts/datadog-monitors/templates/_helpers.tpl
+++ b/charts/datadog-monitors/templates/_helpers.tpl
@@ -1,7 +1,12 @@
-{{- define "datadog-monitors.shared-tags" -}}
-- "service:{{ default $.Release.Name .Values.serviceName }}"
-- "namespace:{{ .Release.Namespace }}"
-{{- with .Values.team }}
-- "team:{{ . }}"
-{{- end }}
-{{- end }}
+{{- define "datadog-monitors.tags" -}}
+{{- $root := index . 0 -}}
+{{- $tags := index . 1 -}}
+{{- $sharedtags := dict "service" (default $root.Release.Name $root.Values.serviceName) "namespace" $root.Release.Namespace -}}
+{{- with $root.Values.team -}}
+{{- $_ := set $sharedtags "team" . -}}
+{{- end -}}
+{{- $finaltags := mergeOverwrite $sharedtags $tags -}}
+{{- range $k, $v := $finaltags -}}
+- "{{ $k }}:{{ $v }}"
+{{ end -}}
+{{- end -}}
\ No newline at end of file
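The reworked helper takes a two-element list -- the root context and the per-monitor tags map -- overlays the monitor's map on the shared service/namespace/team map, and emits one quoted `key:value` entry per key. A rough sketch of what it produces for the datadog-log-alert-test monitor in values.local.yaml (namespace `default` is assumed here, and map keys come out in sorted order):

    # Invocation in templates/datadog-monitors.yaml:
    #   tags: {{ include "datadog-monitors.tags" (list $ .tags) | nindent 4 }}
    # With serviceName: eks, team: cloudeng and monitor tags {test: datadog, team: data},
    # the per-monitor "team: data" overrides the chart-level team and the output is roughly:
    tags:
      - "namespace:default"
      - "service:eks"
      - "team:data"
      - "test:datadog"

One hedged observation on the edge case: for a monitor that defines no `tags` at all, the second list element is nil while sprig's `mergeOverwrite` expects a map, so passing `(default dict .tags)` instead of `.tags` would make the empty case safe; the CI values only exercise monitors that do carry tags.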
diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml
index 43bc61f7..2553074f 100644
--- a/charts/datadog-monitors/templates/datadog-monitors.yaml
+++ b/charts/datadog-monitors/templates/datadog-monitors.yaml
@@ -1,4 +1,4 @@
-{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}}
+{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}}
 {{- range $name, $monitor := .Values.monitors }}
 {{- if not $monitor.disabled }}
 ---
@@ -15,11 +15,10 @@ spec:
   {{- with $monitor.priority }}
   priority: {{.}}
   {{- end }}
-  tags: {{ include "datadog-monitors.shared-tags" $ | nindent 4 }}
-  {{- with $monitor.tags }}{{ toYaml . | nindent 4 }}{{- end }}
+  tags: {{ include "datadog-monitors.tags" (list $ .tags) | nindent 4 }}
   options:
     locked: false
   {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }}
 {{- end }}
 {{- end }}
-{{- end }}
+{{/*- end */}}
diff --git a/charts/datadog-monitors/values.local.yaml b/charts/datadog-monitors/values.local.yaml
index 290b5b9c..000ac5b7 100644
--- a/charts/datadog-monitors/values.local.yaml
+++ b/charts/datadog-monitors/values.local.yaml
@@ -14,7 +14,8 @@ monitors:
     name: "Test log alert made from DatadogMonitor"
     message: "1-2-3 testing"
     tags:
-      - "test:datadog"
+      test: datadog
+      team: data
     priority: 5
     options:
       enableLogsSample: true
diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml
index 97e12980..0200292f 100644
--- a/charts/datadog-monitors/values.yaml
+++ b/charts/datadog-monitors/values.yaml
@@ -42,9 +42,9 @@ monitors:
     #   - network-performance: `network-performance alert`
     #   - service-discovery: `service-discovery alert`
     type: "query alert"
-    # -- (`string[]`) Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace, cluster, app_env, app_group)
+    # -- (`map[string]string`) Optional: Additional monitor tags(will be added on top of the default tags:service, team, namespace)
     tags:
-      - 'tagname:tagvalue'
+      tagname: tagvalue
     # -- (`map[string]interface{}`) Optional: monitor options
     options:
       # -- (`map[string]string`) Optional: monitor thresholds
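Switching the tags value from a list of `name:value` strings to a map has one practical consequence worth keeping in mind: tag keys must now be unique per monitor, and a per-monitor key silently replaces the shared tag of the same name (as the `team: data` override above shows). An illustrative values fragment for the new shape (the monitor name and tag values here are placeholders, not part of the chart):

    monitors:
      my-monitor:                 # illustrative monitor name
        tags:
          env: production         # rendered as "env:production"
          team: data              # overrides the chart-level team tag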
From 36333486d7327c82a6ba3099e2012514ce242374 Mon Sep 17 00:00:00 2001
From: Shalom Cohen
Date: Wed, 6 Nov 2024 11:52:20 -0500
Subject: [PATCH 11/11] Reverse the disabled flag to use enabled flag instead

---
 charts/datadog-monitors/README.md                       | 6 +++---
 charts/datadog-monitors/templates/datadog-monitors.yaml | 6 +++---
 charts/datadog-monitors/values.local.yaml               | 2 +-
 charts/datadog-monitors/values.yaml                     | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/charts/datadog-monitors/README.md b/charts/datadog-monitors/README.md
index ee802aaa..6df8abf0 100644
--- a/charts/datadog-monitors/README.md
+++ b/charts/datadog-monitors/README.md
@@ -20,9 +20,9 @@ datadog monitor alerts template
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
-| monitors | `map[string]interface{}` | `{"resourceName":{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}}` | List of monitors |
-| monitors.resourceName | `map[string]interface{}` | `{"disabled":true,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) |
-| monitors.resourceName.disabled | `boolean` | `true` | Optional: whether to exclude the monitor, defaults to False |
+| monitors | `map[string]interface{}` | `{"resourceName":{"enabled":false,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}}` | List of monitors |
+| monitors.resourceName | `map[string]interface{}` | `{"enabled":false,"message":"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.","name":"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces","options":{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}},"priority":"2","query":"change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {kube_cluster_name,kube_namespace} > 10","tags":{"tagname":"tagvalue"},"type":"query alert"}` | Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) |
+| monitors.resourceName.enabled | `boolean` | `false` | Optional: whether to enable the monitor, defaults to true |
 | monitors.resourceName.message | `string` | `"More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs."` | Required: monitor message |
 | monitors.resourceName.name | `string` | `"[kubernetes] Monitor Kubernetes Failed Pods in Namespaces"` | Require: monitor name |
 | monitors.resourceName.options | `map[string]interface{}` | `{"evaluationDelay":300,"groupbySimpleMonitor":false,"includeTags":false,"newGroupDelay":300,"noDataTimeframe":30,"notifyBy":[],"notifyNoData":false,"renotifyInterval":0,"renotifyOccurrences":0,"renotifyStatus":[],"requireFullWindow":false,"thresholdWindows":{"alertWindow":"5m","recoveryWindow":"10m"},"thresholds":{"critical":"1","warning":"0.28"}}` | Optional: monitor options |
diff --git a/charts/datadog-monitors/templates/datadog-monitors.yaml b/charts/datadog-monitors/templates/datadog-monitors.yaml
index 2553074f..607f391e 100644
--- a/charts/datadog-monitors/templates/datadog-monitors.yaml
+++ b/charts/datadog-monitors/templates/datadog-monitors.yaml
@@ -1,6 +1,6 @@
-{{/*- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -*/}}
+{{- if .Capabilities.APIVersions.Has "datadoghq.com/v1alpha1" -}}
 {{- range $name, $monitor := .Values.monitors }}
-{{- if not $monitor.disabled }}
+{{- if not (eq $monitor.enabled false) }}
 ---
 apiVersion: datadoghq.com/v1alpha1
 kind: DatadogMonitor
@@ -21,4 +21,4 @@ spec:
   {{- with $monitor.options }}{{ toYaml . | nindent 4 }}{{- end }}
 {{- end }}
 {{- end }}
-{{/*- end */}}
+{{- end }}
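With the guard now written as `{{- if not (eq $monitor.enabled false) }}`, a monitor is rendered unless `enabled: false` is set explicitly; leaving the key out keeps the monitor on, which preserves the previous default-on behaviour. An illustrative values fragment (monitor names and the query are placeholders):

    monitors:
      failed-pods:
        enabled: false        # explicitly disabled -> no DatadogMonitor is rendered
      error-budget-burn:
        # no "enabled" key -> treated as enabled and rendered
        query: "..."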
priority: "2" diff --git a/charts/datadog-monitors/values.yaml b/charts/datadog-monitors/values.yaml index 0200292f..c0e4f37b 100644 --- a/charts/datadog-monitors/values.yaml +++ b/charts/datadog-monitors/values.yaml @@ -7,8 +7,8 @@ team: monitors: # -- (`map[string]interface{}`) Required: monitor resource name, Required unique monitor resource name(needed to allow value overrides and used a datadog monitor resource name) resourceName: - # -- (`boolean`) Optional: whether to exclude the monitor, defaults to False - disabled: true + # -- (`boolean`) Optional: whether to enable the monitor, defaults to true + enabled: false # -- (`string`) Require: monitor name name: "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces" # -- (`string`) Required: monitor message