From e92d5f50573cca9220f517d408bd44f571b0ae4d Mon Sep 17 00:00:00 2001 From: Matthieu Bourgain Date: Mon, 3 Feb 2025 08:54:33 +0100 Subject: [PATCH] Changes on Kubernetes monitors (#62) * Add Kubernetes monitors * typoe * replace default apiserver by heartbeat * add explanation on apiserver * Add nginx is down monitor * add vars on titles * fix metric --- caas/kubernetes/cluster/README.md | 13 +- caas/kubernetes/cluster/inputs.tf | 42 +++- .../cluster/monitors-k8s-cluster.tf | 33 ++- caas/kubernetes/cluster/outputs.tf | 5 + caas/kubernetes/ingress/vts/README.md | 10 + caas/kubernetes/ingress/vts/inputs.tf | 46 ++++- .../ingress/vts/monitors-ingress.tf | 28 +++ caas/kubernetes/ingress/vts/outputs.tf | 5 + caas/kubernetes/node/README.md | 25 +-- caas/kubernetes/node/inputs.tf | 24 --- caas/kubernetes/node/monitors-k8s-node.tf | 49 +---- caas/kubernetes/node/outputs.tf | 5 - caas/kubernetes/pod/README.md | 20 ++ caas/kubernetes/pod/inputs.tf | 87 ++++++++ caas/kubernetes/pod/monitors-k8s-pod.tf | 60 ++++++ caas/kubernetes/pod/outputs.tf | 10 + caas/kubernetes/workload/README.md | 46 +++++ caas/kubernetes/workload/inputs.tf | 192 ++++++++++++++++++ .../workload/monitors-k8s-workload.tf | 152 ++++++++++++++ caas/kubernetes/workload/outputs.tf | 25 +++ 20 files changed, 784 insertions(+), 93 deletions(-) diff --git a/caas/kubernetes/cluster/README.md b/caas/kubernetes/cluster/README.md index 0020fa2e..ba8d1172 100644 --- a/caas/kubernetes/cluster/README.md +++ b/caas/kubernetes/cluster/README.md @@ -17,7 +17,8 @@ module "datadog-monitors-caas-kubernetes-cluster" { Creates DataDog monitors with the following checks: -- Kubernetes API server does not respond +- Kubernetes API server does not respond on {{kube_cluster_name}} (disabled by default) +- Kubernetes cluster heartbeat alert on {{kube_cluster_name}} ## Requirements @@ -44,12 +45,13 @@ Creates DataDog monitors with the following checks: | Name | Type | |------|------| | [datadog_monitor.apiserver](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.heartbeat](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [apiserver\_enabled](#input\_apiserver\_enabled) | Flag to enable API server monitor | `string` | `"true"` | no | +| [apiserver\_enabled](#input\_apiserver\_enabled) | Flag to enable API server monitor (do not work on some clusters, see https://docs.datadoghq.com/containers/kubernetes/control_plane/?tab=datadogoperator#ManagedServices) | `string` | `"false"` | no | | [apiserver\_extra\_tags](#input\_apiserver\_extra\_tags) | Extra tags for API server monitor | `list(string)` | `[]` | no | | [apiserver\_message](#input\_apiserver\_message) | Custom message for API server monitor | `string` | `""` | no | | [apiserver\_no\_data\_timeframe](#input\_apiserver\_no\_data\_timeframe) | Number of minutes before reporting no data | `string` | `10` | no | @@ -60,6 +62,12 @@ Creates DataDog monitors with the following checks: | [filter\_tags\_custom\_excluded](#input\_filter\_tags\_custom\_excluded) | Tags excluded for custom filtering when filter\_tags\_use\_defaults is false | `string` | `""` | no | | [filter\_tags\_separator](#input\_filter\_tags\_separator) | Set the filter tags separator (, or AND) | `string` | `","` | no | | [filter\_tags\_use\_defaults](#input\_filter\_tags\_use\_defaults) | Use default filter 
tags convention | `string` | `"true"` | no | +| [heartbeat\_enabled](#input\_heartbeat\_enabled) | Flag to enable heartbeat monitor | `string` | `"true"` | no | +| [heartbeat\_extra\_tags](#input\_heartbeat\_extra\_tags) | Extra tags for heartbeat monitor | `list(string)` | `[]` | no | +| [heartbeat\_message](#input\_heartbeat\_message) | Custom message for heartbeat monitor | `string` | `""` | no | +| [heartbeat\_no\_data\_timeframe](#input\_heartbeat\_no\_data\_timeframe) | Number of minutes before reporting no data | `string` | `20` | no | +| [heartbeat\_time\_aggregator](#input\_heartbeat\_time\_aggregator) | Time aggregator for heartbeat monitor | `string` | `"min"` | no | +| [heartbeat\_timeframe](#input\_heartbeat\_timeframe) | Timeframe for heartbeat monitor | `string` | `"last_30m"` | no | | [message](#input\_message) | Message sent when a monitor is triggered | `any` | n/a | yes | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | | [new\_host\_delay](#input\_new\_host\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | @@ -74,6 +82,7 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| | [apiserver\_id](#output\_apiserver\_id) | id for monitor apiserver | +| [heartbeat\_id](#output\_heartbeat\_id) | id for monitor heartbeat | ## Related documentation diff --git a/caas/kubernetes/cluster/inputs.tf b/caas/kubernetes/cluster/inputs.tf index 1f2249ca..46e3ba65 100644 --- a/caas/kubernetes/cluster/inputs.tf +++ b/caas/kubernetes/cluster/inputs.tf @@ -66,11 +66,11 @@ variable "apiserver_no_data_timeframe" { } # Datadog monitors variables - +## API server monitor variables variable "apiserver_enabled" { - description = "Flag to enable API server monitor" + description = "Flag to enable API server monitor (do not work on some clusters, see https://docs.datadoghq.com/containers/kubernetes/control_plane/?tab=datadogoperator#ManagedServices)" type = string - default = "true" + default = "false" } variable "apiserver_extra_tags" { @@ -91,3 +91,39 @@ variable "apiserver_threshold_warning" { default = 3 } +## Heartbeat monitor variables +variable "heartbeat_enabled" { + description = "Flag to enable heartbeat monitor" + type = string + default = "true" +} + +variable "heartbeat_message" { + description = "Custom message for heartbeat monitor" + type = string + default = "" +} + +variable "heartbeat_no_data_timeframe" { + description = "Number of minutes before reporting no data" + type = string + default = 20 +} + +variable "heartbeat_time_aggregator" { + description = "Time aggregator for heartbeat monitor" + type = string + default = "min" +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat monitor" + type = string + default = "last_30m" +} + +variable "heartbeat_extra_tags" { + description = "Extra tags for heartbeat monitor" + type = list(string) + default = [] +} diff --git a/caas/kubernetes/cluster/monitors-k8s-cluster.tf b/caas/kubernetes/cluster/monitors-k8s-cluster.tf index b6c63f8d..5ee9c70d 100644 --- a/caas/kubernetes/cluster/monitors-k8s-cluster.tf +++ b/caas/kubernetes/cluster/monitors-k8s-cluster.tf @@ -1,12 +1,12 @@ resource "datadog_monitor" "apiserver" { count = var.apiserver_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes API server does not respond" + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes API server does not respond on {{kube_cluster_name}}" message = coalesce(var.apiserver_message, var.message) type = "service check" query = < 1000000 +EOQ + + monitor_thresholds { + critical = 1000000 # high threshold to handle no data only + } + + new_group_delay = var.new_group_delay + notify_no_data = true + no_data_timeframe = var.heartbeat_no_data_timeframe + renotify_interval = 0 + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + tags = concat(local.common_tags, var.tags, var.heartbeat_extra_tags) +} diff --git a/caas/kubernetes/cluster/outputs.tf b/caas/kubernetes/cluster/outputs.tf index 496255af..1464508a 100644 --- a/caas/kubernetes/cluster/outputs.tf +++ b/caas/kubernetes/cluster/outputs.tf @@ -3,3 +3,8 @@ output "apiserver_id" { value = datadog_monitor.apiserver.*.id } +output "heartbeat_id" { + description = "id for monitor heartbeat" + value = datadog_monitor.heartbeat.*.id +} + diff --git a/caas/kubernetes/ingress/vts/README.md b/caas/kubernetes/ingress/vts/README.md index 6b8413d5..3d9bcf7b 100644 --- a/caas/kubernetes/ingress/vts/README.md +++ b/caas/kubernetes/ingress/vts/README.md @@ -19,6 +19,7 @@ Creates DataDog monitors with the following checks: - Nginx Ingress 4xx errors - Nginx Ingress 5xx errors +- Nginx Ingress {{kube_replica_set}} is down on {{kube_cluster_name}} ## Requirements @@ -46,6 +47,7 @@ Creates DataDog monitors with the following checks: | Name | Type | |------|------| +| [datadog_monitor.nginx_ingress_is_down](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.nginx_ingress_too_many_4xx](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.nginx_ingress_too_many_5xx](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | @@ -74,6 +76,13 @@ Creates DataDog monitors with the following checks: | [ingress\_5xx\_threshold\_warning](#input\_ingress\_5xx\_threshold\_warning) | 5xx warning threshold in percentage | `string` | `"10"` | no | | [ingress\_5xx\_time\_aggregator](#input\_ingress\_5xx\_time\_aggregator) | Monitor aggregator for Ingress 5xx errors [available values: min, max or avg] | `string` | `"min"` | no | | [ingress\_5xx\_timeframe](#input\_ingress\_5xx\_timeframe) | Monitor timeframe for Ingress 5xx errors [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [ingress\_down\_enabled](#input\_ingress\_down\_enabled) | Flag to enable Nginx Ingress is down monitor | `string` | `"true"` | no | +| [ingress\_down\_extra\_tags](#input\_ingress\_down\_extra\_tags) | Extra tags for Nginx Ingress is down monitor | `list(string)` | `[]` | no | +| [ingress\_down\_message](#input\_ingress\_down\_message) | Message sent when an alert is triggered | `string` | `""` | no | +| [ingress\_down\_threshold\_critical](#input\_ingress\_down\_threshold\_critical) | Nginx Ingress is down critical threshold in percentage | `number` | `0.3` | no | +| [ingress\_down\_threshold\_warning](#input\_ingress\_down\_threshold\_warning) | Nginx Ingress is down warning threshold in percentage | `number` | `0.7` | no | +| [ingress\_down\_time\_aggregator](#input\_ingress\_down\_time\_aggregator) | Monitor aggregator for Nginx Ingress is down [available values: min, max or avg] | `string` | `"avg"` | no | +| 
[ingress\_down\_timeframe](#input\_ingress\_down\_timeframe) | Monitor timeframe for Nginx Ingress is down [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_10m"` | no | | [message](#input\_message) | Message sent when an alert is triggered | `any` | n/a | yes | | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | | [new\_host\_delay](#input\_new\_host\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | @@ -87,6 +96,7 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| +| [nginx\_ingress\_is\_down\_id](#output\_nginx\_ingress\_is\_down\_id) | id for monitor nginx\_ingress\_is\_down | | [nginx\_ingress\_too\_many\_4xx\_id](#output\_nginx\_ingress\_too\_many\_4xx\_id) | id for monitor nginx\_ingress\_too\_many\_4xx | | [nginx\_ingress\_too\_many\_5xx\_id](#output\_nginx\_ingress\_too\_many\_5xx\_id) | id for monitor nginx\_ingress\_too\_many\_5xx | diff --git a/caas/kubernetes/ingress/vts/inputs.tf b/caas/kubernetes/ingress/vts/inputs.tf index 19f3974a..0401454d 100644 --- a/caas/kubernetes/ingress/vts/inputs.tf +++ b/caas/kubernetes/ingress/vts/inputs.tf @@ -59,8 +59,8 @@ variable "filter_tags_separator" { default = "," } -#Ingress - +# Nginx Ingress +## Nginx Ingress 5xx errors monitor variable "ingress_5xx_enabled" { description = "Flag to enable Ingress 5xx errors monitor" type = string @@ -102,6 +102,7 @@ variable "ingress_5xx_threshold_warning" { description = "5xx warning threshold in percentage" } +## Nginx Ingress 4xx errors monitor variable "ingress_4xx_enabled" { description = "Flag to enable Ingress 4xx errors monitor" type = string @@ -148,3 +149,44 @@ variable "artificial_requests_count" { description = "Number of false requests used to mitigate false positive in case of low trafic" } +## Nginx Ingress is down monitor +variable "ingress_down_enabled" { + type = string + default = "true" + description = "Flag to enable Nginx Ingress is down monitor" +} + +variable "ingress_down_message" { + default = "" + description = "Message sent when an alert is triggered" +} + +variable "ingress_down_time_aggregator" { + type = string + default = "avg" + description = "Monitor aggregator for Nginx Ingress is down [available values: min, max or avg]" +} + +variable "ingress_down_timeframe" { + type = string + default = "last_10m" + description = "Monitor timeframe for Nginx Ingress is down [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" +} + +variable "ingress_down_threshold_critical" { + type = number + default = 0.3 + description = "Nginx Ingress is down critical threshold in percentage" +} + +variable "ingress_down_threshold_warning" { + type = number + default = 0.7 + description = "Nginx Ingress is down warning threshold in percentage" +} + +variable "ingress_down_extra_tags" { + type = list(string) + default = [] + description = "Extra tags for Nginx Ingress is down monitor" +} diff --git a/caas/kubernetes/ingress/vts/monitors-ingress.tf b/caas/kubernetes/ingress/vts/monitors-ingress.tf index 87064b5a..cbac7895 100644 --- a/caas/kubernetes/ingress/vts/monitors-ingress.tf +++ b/caas/kubernetes/ingress/vts/monitors-ingress.tf @@ -60,3 +60,31 @@ EOQ tags = concat(local.common_tags, var.tags, var.ingress_4xx_extra_tags) } +resource "datadog_monitor" "nginx_ingress_is_down" { + count = var.ingress_down_enabled == "true" ? 
1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Nginx Ingress {{kube_replica_set}} is down on {{kube_cluster_name}}" + message = coalesce(var.ingress_down_message, var.message) + type = "query alert" + + query = < ## Requirements @@ -53,7 +52,6 @@ Creates DataDog monitors with the following checks: | Name | Type | |------|------| -| [datadog_monitor.disk_out](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.disk_pressure](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.kubelet_ping](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.kubelet_syncloop](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | @@ -68,10 +66,6 @@ Creates DataDog monitors with the following checks: | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [disk\_out\_enabled](#input\_disk\_out\_enabled) | Flag to enable Out of disk monitor | `string` | `"true"` | no | -| [disk\_out\_extra\_tags](#input\_disk\_out\_extra\_tags) | Extra tags for Out of disk monitor | `list(string)` | `[]` | no | -| [disk\_out\_message](#input\_disk\_out\_message) | Custom message for Out of disk monitor | `string` | `""` | no | -| [disk\_out\_threshold\_warning](#input\_disk\_out\_threshold\_warning) | Out of disk monitor (warning threshold) | `string` | `3` | no | | [disk\_pressure\_enabled](#input\_disk\_pressure\_enabled) | Flag to enable Disk pressure monitor | `string` | `"true"` | no | | [disk\_pressure\_extra\_tags](#input\_disk\_pressure\_extra\_tags) | Extra tags for Disk pressure monitor | `list(string)` | `[]` | no | | [disk\_pressure\_message](#input\_disk\_pressure\_message) | Custom message for Disk pressure monitor | `string` | `""` | no | @@ -137,7 +131,6 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| -| [disk\_out\_id](#output\_disk\_out\_id) | id for monitor disk\_out | | [disk\_pressure\_id](#output\_disk\_pressure\_id) | id for monitor disk\_pressure | | [kubelet\_ping\_id](#output\_kubelet\_ping\_id) | id for monitor kubelet\_ping | | [kubelet\_syncloop\_id](#output\_kubelet\_syncloop\_id) | id for monitor kubelet\_syncloop | diff --git a/caas/kubernetes/node/inputs.tf b/caas/kubernetes/node/inputs.tf index 6dc3f091..f16c1711 100644 --- a/caas/kubernetes/node/inputs.tf +++ b/caas/kubernetes/node/inputs.tf @@ -91,30 +91,6 @@ variable "disk_pressure_threshold_warning" { default = 3 } -variable "disk_out_enabled" { - description = "Flag to enable Out of disk monitor" - type = string - default = "true" -} - -variable "disk_out_extra_tags" { - description = "Extra tags for Out of disk monitor" - type = list(string) - default = [] -} - -variable "disk_out_message" { - description = "Custom message for Out of disk monitor" - type = string - default = "" -} - -variable "disk_out_threshold_warning" { - description = "Out of disk monitor (warning threshold)" - type = string - default = 3 -} - variable "memory_pressure_enabled" { description = "Flag to enable Memory pressure monitor" type = string diff --git a/caas/kubernetes/node/monitors-k8s-node.tf b/caas/kubernetes/node/monitors-k8s-node.tf index 4723a15d..d5584127 100644 --- a/caas/kubernetes/node/monitors-k8s-node.tf +++ b/caas/kubernetes/node/monitors-k8s-node.tf @@ -1,6 +1,6 
@@ resource "datadog_monitor" "disk_pressure" { count = var.disk_pressure_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Disk pressure" + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node {{kube_node}} disk pressure on {{kube_cluster_name}}" message = coalesce(var.disk_pressure_message, var.message) type = "service check" @@ -25,36 +25,9 @@ EOQ tags = concat(local.common_tags, var.tags, var.disk_pressure_extra_tags) } -resource "datadog_monitor" "disk_out" { - count = var.disk_out_enabled == "true" ? 1 : 0 - name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Node Out of disk" - message = coalesce(var.disk_out_message, var.message) - type = "service check" - - query = < ## Requirements @@ -48,6 +50,8 @@ Creates DataDog monitors with the following checks: | Name | Type | |------|------| | [datadog_monitor.error](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.pod_container_killed_by_oom](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.pod_frequently_restarted](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.pod_phase_status](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.terminated](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | @@ -72,6 +76,20 @@ Creates DataDog monitors with the following checks: | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | | [new\_host\_delay](#input\_new\_host\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | | [notify\_no\_data](#input\_notify\_no\_data) | Will raise no data alert if set to true | `bool` | `true` | no | +| [pod\_container\_killed\_by\_oom\_enabled](#input\_pod\_container\_killed\_by\_oom\_enabled) | Flag to enable Pod container killed by OOM monitor | `string` | `"true"` | no | +| [pod\_container\_killed\_by\_oom\_extra\_tags](#input\_pod\_container\_killed\_by\_oom\_extra\_tags) | Extra tags for Pod container killed by OOM monitor | `list(string)` | `[]` | no | +| [pod\_container\_killed\_by\_oom\_message](#input\_pod\_container\_killed\_by\_oom\_message) | Custom message for Pod container killed by OOM monitor | `string` | `""` | no | +| [pod\_container\_killed\_by\_oom\_threshold\_critical](#input\_pod\_container\_killed\_by\_oom\_threshold\_critical) | Pod container killed by OOM critical threshold | `number` | `5` | no | +| [pod\_container\_killed\_by\_oom\_threshold\_warning](#input\_pod\_container\_killed\_by\_oom\_threshold\_warning) | Pod container killed by OOM warning threshold | `number` | `0` | no | +| [pod\_container\_killed\_by\_oom\_time\_aggregator](#input\_pod\_container\_killed\_by\_oom\_time\_aggregator) | Monitor aggregator for Pod container killed by OOM [available values: min, max or avg] | `string` | `"avg"` | no | +| [pod\_container\_killed\_by\_oom\_timeframe](#input\_pod\_container\_killed\_by\_oom\_timeframe) | Monitor timeframe for Pod container killed by OOM [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no | +| 
[pod\_frequently\_restarted\_enabled](#input\_pod\_frequently\_restarted\_enabled) | Flag to enable Pod frequently restarted monitor | `string` | `"true"` | no | +| [pod\_frequently\_restarted\_extra\_tags](#input\_pod\_frequently\_restarted\_extra\_tags) | Extra tags for Pod frequently restarted monitor | `list(string)` | `[]` | no | +| [pod\_frequently\_restarted\_message](#input\_pod\_frequently\_restarted\_message) | Custom message for Pod frequently restarted monitor | `string` | `""` | no | +| [pod\_frequently\_restarted\_threshold\_critical](#input\_pod\_frequently\_restarted\_threshold\_critical) | Pod frequently restarted critical threshold | `number` | `10` | no | +| [pod\_frequently\_restarted\_threshold\_warning](#input\_pod\_frequently\_restarted\_threshold\_warning) | Pod frequently restarted warning threshold | `number` | `5` | no | +| [pod\_frequently\_restarted\_time\_aggregator](#input\_pod\_frequently\_restarted\_time\_aggregator) | Monitor aggregator for Pod frequently restarted [available values: min, max or avg] | `string` | `"min"` | no | +| [pod\_frequently\_restarted\_timeframe](#input\_pod\_frequently\_restarted\_timeframe) | Monitor timeframe for Pod frequently restarted [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no | | [pod\_group\_by](#input\_pod\_group\_by) | Select group by element on monitors (error and terminated) | `list` |
["kube_namespace", "pod_name", "reason", "kube_cluster_name"]
| no | | [pod\_phase\_status\_enabled](#input\_pod\_phase\_status\_enabled) | Flag to enable Pod phase status monitor | `string` | `"true"` | no | | [pod\_phase\_status\_extra\_tags](#input\_pod\_phase\_status\_extra\_tags) | Extra tags for Pod phase status monitor | `list(string)` | `[]` | no | @@ -96,6 +114,8 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| | [error\_id](#output\_error\_id) | id for monitor error | +| [pod\_container\_killed\_by\_oom\_id](#output\_pod\_container\_killed\_by\_oom\_id) | id for monitor pod\_container\_killed\_by\_oom | +| [pod\_frequently\_restarted\_id](#output\_pod\_frequently\_restarted\_id) | id for monitor pod\_frequently\_restarted | | [pod\_phase\_status\_id](#output\_pod\_phase\_status\_id) | id for monitor pod\_phase\_status | | [terminated\_id](#output\_terminated\_id) | id for monitor terminated | diff --git a/caas/kubernetes/pod/inputs.tf b/caas/kubernetes/pod/inputs.tf index 548163b5..1fc01ca9 100644 --- a/caas/kubernetes/pod/inputs.tf +++ b/caas/kubernetes/pod/inputs.tf @@ -171,6 +171,93 @@ variable "terminated_threshold_warning" { description = "terminated warning threshold" } +# Pod container killed by OOM +variable "pod_container_killed_by_oom_enabled" { + description = "Flag to enable Pod container killed by OOM monitor" + type = string + default = "true" +} + +variable "pod_container_killed_by_oom_message" { + description = "Custom message for Pod container killed by OOM monitor" + type = string + default = "" +} + +variable "pod_container_killed_by_oom_time_aggregator" { + description = "Monitor aggregator for Pod container killed by OOM [available values: min, max or avg]" + type = string + default = "avg" +} + +variable "pod_container_killed_by_oom_timeframe" { + description = "Monitor timeframe for Pod container killed by OOM [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_15m" +} + +variable "pod_container_killed_by_oom_threshold_warning" { + description = "Pod container killed by OOM warning threshold" + type = number + default = 0 +} + +variable "pod_container_killed_by_oom_threshold_critical" { + description = "Pod container killed by OOM critical threshold" + type = number + default = 5 +} + +variable "pod_container_killed_by_oom_extra_tags" { + description = "Extra tags for Pod container killed by OOM monitor" + type = list(string) + default = [] +} + +# Pod frequently restarted +variable "pod_frequently_restarted_enabled" { + description = "Flag to enable Pod frequently restarted monitor" + type = string + default = "true" +} + +variable "pod_frequently_restarted_message" { + description = "Custom message for Pod frequently restarted monitor" + type = string + default = "" +} + +variable "pod_frequently_restarted_time_aggregator" { + description = "Monitor aggregator for Pod frequently restarted [available values: min, max or avg]" + type = string + default = "min" +} + +variable "pod_frequently_restarted_timeframe" { + description = "Monitor timeframe for Pod frequently restarted [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_15m" +} + +variable "pod_frequently_restarted_threshold_warning" { + description = "Pod frequently restarted warning threshold" + type = number + default = 5 +} + +variable "pod_frequently_restarted_threshold_critical" { + description = "Pod frequently restarted critical threshold" + type = number + 
default = 10 +} + +variable "pod_frequently_restarted_extra_tags" { + description = "Extra tags for Pod frequently restarted monitor" + type = list(string) + default = [] +} + +# General filter tags variable "pod_group_by" { default = ["kube_namespace", "pod_name", "reason", "kube_cluster_name"] description = "Select group by element on monitors (error and terminated)" diff --git a/caas/kubernetes/pod/monitors-k8s-pod.tf b/caas/kubernetes/pod/monitors-k8s-pod.tf index 3a1c9a8b..960d0bd3 100644 --- a/caas/kubernetes/pod/monitors-k8s-pod.tf +++ b/caas/kubernetes/pod/monitors-k8s-pod.tf @@ -87,3 +87,63 @@ EOQ tags = concat(local.common_tags, var.tags, var.terminated_extra_tags) } +resource "datadog_monitor" "pod_container_killed_by_oom" { + count = var.pod_container_killed_by_oom_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod {{pod_name}} container {{kube_container_name}} killed by OOM on {{kube_cluster_name}}" + message = coalesce(var.pod_container_killed_by_oom_message, var.message) + type = "query alert" + + query = < ${var.pod_container_killed_by_oom_threshold_critical} +EOQ + + monitor_thresholds { + warning = var.pod_container_killed_by_oom_threshold_warning + critical = var.pod_container_killed_by_oom_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + tags = concat(local.common_tags, var.tags, var.pod_container_killed_by_oom_extra_tags) +} + +resource "datadog_monitor" "pod_frequently_restarted" { + count = var.pod_frequently_restarted_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? 
"" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pods in {{kube_replica_set}} frequently restarted on {{kube_cluster_name}}" + message = coalesce(var.pod_frequently_restarted_message, var.message) + type = "metric alert" + + query = <= ${var.pod_frequently_restarted_threshold_critical} +EOQ + + monitor_thresholds { + warning = var.pod_frequently_restarted_threshold_warning + critical = var.pod_frequently_restarted_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + tags = concat(local.common_tags, var.tags, var.pod_frequently_restarted_extra_tags) +} diff --git a/caas/kubernetes/pod/outputs.tf b/caas/kubernetes/pod/outputs.tf index 1e715800..dede2db1 100644 --- a/caas/kubernetes/pod/outputs.tf +++ b/caas/kubernetes/pod/outputs.tf @@ -3,6 +3,16 @@ output "error_id" { value = datadog_monitor.error.*.id } +output "pod_container_killed_by_oom_id" { + description = "id for monitor pod_container_killed_by_oom" + value = datadog_monitor.pod_container_killed_by_oom.*.id +} + +output "pod_frequently_restarted_id" { + description = "id for monitor pod_frequently_restarted" + value = datadog_monitor.pod_frequently_restarted.*.id +} + output "pod_phase_status_id" { description = "id for monitor pod_phase_status" value = datadog_monitor.pod_phase_status.*.id diff --git a/caas/kubernetes/workload/README.md b/caas/kubernetes/workload/README.md index 3f7650c1..c83254b5 100644 --- a/caas/kubernetes/workload/README.md +++ b/caas/kubernetes/workload/README.md @@ -20,8 +20,13 @@ Creates DataDog monitors with the following checks: - Kubernetes Available replicas - Kubernetes cronjob scheduling failed - Kubernetes Current replicas +- Kubernetes DaemonSet {{kube_daemon_set}} not ready on {{kube_cluster_name}} +- Kubernetes Deployment {{kube_deployment}} replica too low on {{kube_cluster_name}} +- Kubernetes HPA cannot scale out further {{horizontalpodautoscaler}} on {{kube_cluster_name}} - Kubernetes job failed +- Kubernetes Pod Disruption Budget {{poddisruptionbudget}} not respected on {{kube_cluster_name}} - Kubernetes Ready replicas +- Kubernetes StatefulSet {{kube_stateful_set}} not ready on {{kube_cluster_name}} ## Requirements @@ -48,10 +53,15 @@ Creates DataDog monitors with the following checks: | Name | Type | |------|------| | [datadog_monitor.cronjob](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.daemonset_pods_not_ready](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.deployments_replica_too_low](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.hpa_cannot_scaleup_further](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.job](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.pod_disruption_budget_not_respected](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.replica_available](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | 
[datadog_monitor.replica_current](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | | [datadog_monitor.replica_ready](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | +| [datadog_monitor.statefulset_pods_not_ready](https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor) | resource | ## Inputs @@ -62,13 +72,32 @@ Creates DataDog monitors with the following checks: | [cronjob\_message](#input\_cronjob\_message) | Custom message for Cronjob monitor | `string` | `""` | no | | [cronjob\_threshold\_warning](#input\_cronjob\_threshold\_warning) | Cronjob monitor (warning threshold) | `string` | `3` | no | | [cronjobfailed\_group\_by](#input\_cronjobfailed\_group\_by) | n/a | `list` |
["kube_cronjob"]
| no | +| [daemonset\_pods\_not\_ready\_enabled](#input\_daemonset\_pods\_not\_ready\_enabled) | Flag to enable DaemonSet pods not ready monitor | `string` | `"true"` | no | +| [daemonset\_pods\_not\_ready\_extra\_tags](#input\_daemonset\_pods\_not\_ready\_extra\_tags) | Extra tags for DaemonSet pods not ready monitor | `list(string)` | `[]` | no | +| [daemonset\_pods\_not\_ready\_message](#input\_daemonset\_pods\_not\_ready\_message) | Custom message for DaemonSet pods not ready monitor | `string` | `""` | no | +| [daemonset\_pods\_not\_ready\_threshold\_critical](#input\_daemonset\_pods\_not\_ready\_threshold\_critical) | DaemonSet pods not ready critical threshold | `number` | `1` | no | +| [daemonset\_pods\_not\_ready\_time\_aggregator](#input\_daemonset\_pods\_not\_ready\_time\_aggregator) | Monitor aggregator for DaemonSet pods not ready [available values: min, max or avg] | `string` | `"max"` | no | +| [daemonset\_pods\_not\_ready\_timeframe](#input\_daemonset\_pods\_not\_ready\_timeframe) | Monitor timeframe for DaemonSet pods not ready [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_30m"` | no | | [deployment\_group\_by](#input\_deployment\_group\_by) | Select group by element on deployment monitors | `list` |
["kube_namespace", "kube_deployment", "kube_cluster_name"]
| no | +| [deployments\_replica\_too\_low\_enabled](#input\_deployments\_replica\_too\_low\_enabled) | Flag to enable Deployment replica too low monitor | `string` | `"true"` | no | +| [deployments\_replica\_too\_low\_extra\_tags](#input\_deployments\_replica\_too\_low\_extra\_tags) | Extra tags for Deployment replica too low monitor | `list(string)` | `[]` | no | +| [deployments\_replica\_too\_low\_message](#input\_deployments\_replica\_too\_low\_message) | Custom message for Deployment replica too low monitor | `string` | `""` | no | +| [deployments\_replica\_too\_low\_threshold\_critical](#input\_deployments\_replica\_too\_low\_threshold\_critical) | Deployment replica too low critical threshold | `number` | `0` | no | +| [deployments\_replica\_too\_low\_time\_aggregator](#input\_deployments\_replica\_too\_low\_time\_aggregator) | Monitor aggregator for Deployment replica too low [available values: min, max or avg] | `string` | `"max"` | no | +| [deployments\_replica\_too\_low\_timeframe](#input\_deployments\_replica\_too\_low\_timeframe) | Monitor timeframe for Deployment replica too low [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no | | [environment](#input\_environment) | Architecture Environment | `string` | n/a | yes | | [evaluation\_delay](#input\_evaluation\_delay) | Delay in seconds for the metric evaluation | `number` | `15` | no | | [filter\_tags\_custom](#input\_filter\_tags\_custom) | Tags used for custom filtering when filter\_tags\_use\_defaults is false | `string` | `"*"` | no | | [filter\_tags\_custom\_excluded](#input\_filter\_tags\_custom\_excluded) | Tags excluded for custom filtering when filter\_tags\_use\_defaults is false | `string` | `""` | no | | [filter\_tags\_separator](#input\_filter\_tags\_separator) | Set the filter tags separator (, or AND) | `string` | `","` | no | | [filter\_tags\_use\_defaults](#input\_filter\_tags\_use\_defaults) | Use default filter tags convention | `string` | `"true"` | no | +| [hpa\_cannot\_scaleup\_further\_enabled](#input\_hpa\_cannot\_scaleup\_further\_enabled) | Flag to enable HPA cannot scale up further monitor | `string` | `"true"` | no | +| [hpa\_cannot\_scaleup\_further\_extra\_tags](#input\_hpa\_cannot\_scaleup\_further\_extra\_tags) | Extra tags for HPA cannot scale up further monitor | `list(string)` | `[]` | no | +| [hpa\_cannot\_scaleup\_further\_message](#input\_hpa\_cannot\_scaleup\_further\_message) | Custom message for HPA cannot scale up further monitor | `string` | `""` | no | +| [hpa\_cannot\_scaleup\_further\_threshold\_critical](#input\_hpa\_cannot\_scaleup\_further\_threshold\_critical) | HPA cannot scale up further critical threshold | `number` | `100` | no | +| [hpa\_cannot\_scaleup\_further\_threshold\_warning](#input\_hpa\_cannot\_scaleup\_further\_threshold\_warning) | HPA cannot scale up further warning threshold | `number` | `90` | no | +| [hpa\_cannot\_scaleup\_further\_time\_aggregator](#input\_hpa\_cannot\_scaleup\_further\_time\_aggregator) | Monitor aggregator for HPA cannot scale up further [available values: min, max or avg] | `string` | `"avg"` | no | +| [hpa\_cannot\_scaleup\_further\_timeframe](#input\_hpa\_cannot\_scaleup\_further\_timeframe) | Monitor timeframe for HPA cannot scale up further [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_30m"` | no | | [job\_enabled](#input\_job\_enabled) | Flag to enable Job monitor | `string` | `"true"` | no | | 
[job\_extra\_tags](#input\_job\_extra\_tags) | Extra tags for Job monitor | `list(string)` | `[]` | no | | [job\_message](#input\_job\_message) | Custom message for Job monitor | `string` | `""` | no | @@ -78,6 +107,12 @@ Creates DataDog monitors with the following checks: | [new\_group\_delay](#input\_new\_group\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | | [new\_host\_delay](#input\_new\_host\_delay) | Delay in seconds before monitor new resource | `number` | `300` | no | | [notify\_no\_data](#input\_notify\_no\_data) | Will raise no data alert if set to true | `bool` | `true` | no | +| [pod\_disruption\_budget\_not\_respected\_enabled](#input\_pod\_disruption\_budget\_not\_respected\_enabled) | Flag to enable Pod Disruption Budget not respected monitor | `string` | `"true"` | no | +| [pod\_disruption\_budget\_not\_respected\_extra\_tags](#input\_pod\_disruption\_budget\_not\_respected\_extra\_tags) | Extra tags for Pod Disruption Budget not respected monitor | `list(string)` | `[]` | no | +| [pod\_disruption\_budget\_not\_respected\_message](#input\_pod\_disruption\_budget\_not\_respected\_message) | Custom message for Pod Disruption Budget not respected monitor | `string` | `""` | no | +| [pod\_disruption\_budget\_not\_respected\_threshold\_critical](#input\_pod\_disruption\_budget\_not\_respected\_threshold\_critical) | Pod Disruption Budget not respected critical threshold | `number` | `0` | no | +| [pod\_disruption\_budget\_not\_respected\_time\_aggregator](#input\_pod\_disruption\_budget\_not\_respected\_time\_aggregator) | Monitor aggregator for Pod Disruption Budget not respected [available values: min, max or avg] | `string` | `"max"` | no | +| [pod\_disruption\_budget\_not\_respected\_timeframe](#input\_pod\_disruption\_budget\_not\_respected\_timeframe) | Monitor timeframe for Pod Disruption Budget not respected [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no | | [prefix\_slug](#input\_prefix\_slug) | Prefix string to prepend between brackets on every monitors names | `string` | `""` | no | | [replica\_available\_enabled](#input\_replica\_available\_enabled) | Flag to enable Available replica monitor | `string` | `"true"` | no | | [replica\_available\_extra\_tags](#input\_replica\_available\_extra\_tags) | Extra tags for Available replicamonitor | `list(string)` | `[]` | no | @@ -98,6 +133,12 @@ Creates DataDog monitors with the following checks: | [replica\_ready\_threshold\_critical](#input\_replica\_ready\_threshold\_critical) | Ready replica critical threshold | `number` | `1` | no | | [replica\_ready\_time\_aggregator](#input\_replica\_ready\_time\_aggregator) | Monitor aggregator for Ready replica [available values: min, max or avg] | `string` | `"max"` | no | | [replica\_ready\_timeframe](#input\_replica\_ready\_timeframe) | Monitor timeframe for Ready replica [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_5m"` | no | +| [statefulset\_pods\_not\_ready\_enabled](#input\_statefulset\_pods\_not\_ready\_enabled) | Flag to enable StatefulSet pods not ready monitor | `string` | `"true"` | no | +| [statefulset\_pods\_not\_ready\_extra\_tags](#input\_statefulset\_pods\_not\_ready\_extra\_tags) | Extra tags for StatefulSet pods not ready monitor | `list(string)` | `[]` | no | +| [statefulset\_pods\_not\_ready\_message](#input\_statefulset\_pods\_not\_ready\_message) | Custom message for StatefulSet pods not 
ready monitor | `string` | `""` | no | +| [statefulset\_pods\_not\_ready\_threshold\_critical](#input\_statefulset\_pods\_not\_ready\_threshold\_critical) | StatefulSet pods not ready critical threshold | `number` | `100` | no | +| [statefulset\_pods\_not\_ready\_time\_aggregator](#input\_statefulset\_pods\_not\_ready\_time\_aggregator) | Monitor aggregator for StatefulSet pods not ready [available values: min, max or avg] | `string` | `"max"` | no | +| [statefulset\_pods\_not\_ready\_timeframe](#input\_statefulset\_pods\_not\_ready\_timeframe) | Monitor timeframe for StatefulSet pods not ready [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`] | `string` | `"last_15m"` | no | | [tags](#input\_tags) | Global variables | `list(string)` |
["type:caas", "provider:kubernetes", "resource:kubernetes-workload"]
| no | | [team](#input\_team) | n/a | `string` | `"claranet"` | no | | [timeout\_h](#input\_timeout\_h) | Default auto-resolving state (in hours) | `number` | `0` | no | @@ -107,10 +148,15 @@ Creates DataDog monitors with the following checks: | Name | Description | |------|-------------| | [cronjob\_id](#output\_cronjob\_id) | id for monitor cronjob | +| [daemonset\_pods\_not\_ready\_id](#output\_daemonset\_pods\_not\_ready\_id) | id for monitor daemonset\_pods\_not\_ready | +| [deployments\_replica\_too\_low\_id](#output\_deployments\_replica\_too\_low\_id) | id for monitor deployments\_replica\_too\_low | +| [hpa\_cannot\_scaleup\_further\_id](#output\_hpa\_cannot\_scaleup\_further\_id) | id for monitor hpa\_cannot\_scaleup\_further | | [job\_id](#output\_job\_id) | id for monitor job | +| [pod\_disruption\_budget\_not\_respected\_id](#output\_pod\_disruption\_budget\_not\_respected\_id) | id for monitor pod\_disruption\_budget\_not\_respected | | [replica\_available\_id](#output\_replica\_available\_id) | id for monitor replica\_available | | [replica\_current\_id](#output\_replica\_current\_id) | id for monitor replica\_current | | [replica\_ready\_id](#output\_replica\_ready\_id) | id for monitor replica\_ready | +| [statefulset\_pods\_not\_ready\_id](#output\_statefulset\_pods\_not\_ready\_id) | id for monitor statefulset\_pods\_not\_ready | ## Related documentation diff --git a/caas/kubernetes/workload/inputs.tf b/caas/kubernetes/workload/inputs.tf index 338cb9e3..1883aedd 100644 --- a/caas/kubernetes/workload/inputs.tf +++ b/caas/kubernetes/workload/inputs.tf @@ -214,6 +214,198 @@ variable "replica_current_threshold_critical" { description = "Current replica critical threshold" } +# DaemonSet not ready +variable "daemonset_pods_not_ready_enabled" { + description = "Flag to enable DaemonSet pods not ready monitor" + type = string + default = "true" +} + +variable "daemonset_pods_not_ready_message" { + description = "Custom message for DaemonSet pods not ready monitor" + type = string + default = "" +} + +variable "daemonset_pods_not_ready_time_aggregator" { + description = "Monitor aggregator for DaemonSet pods not ready [available values: min, max or avg]" + type = string + default = "max" +} + +variable "daemonset_pods_not_ready_timeframe" { + description = "Monitor timeframe for DaemonSet pods not ready [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_30m" +} + +variable "daemonset_pods_not_ready_threshold_critical" { + description = "DaemonSet pods not ready critical threshold" + type = number + default = 1 +} + +variable "daemonset_pods_not_ready_extra_tags" { + description = "Extra tags for DaemonSet pods not ready monitor" + type = list(string) + default = [] +} + +# StatefulSet not ready +variable "statefulset_pods_not_ready_enabled" { + description = "Flag to enable StatefulSet pods not ready monitor" + type = string + default = "true" +} + +variable "statefulset_pods_not_ready_message" { + description = "Custom message for StatefulSet pods not ready monitor" + type = string + default = "" +} + +variable "statefulset_pods_not_ready_time_aggregator" { + description = "Monitor aggregator for StatefulSet pods not ready [available values: min, max or avg]" + type = string + default = "max" +} + +variable "statefulset_pods_not_ready_timeframe" { + description = "Monitor timeframe for StatefulSet pods not ready [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + 
type = string + default = "last_15m" +} + +variable "statefulset_pods_not_ready_threshold_critical" { + description = "StatefulSet pods not ready critical threshold" + type = number + default = 100 +} + +variable "statefulset_pods_not_ready_extra_tags" { + description = "Extra tags for StatefulSet pods not ready monitor" + type = list(string) + default = [] +} + +# Deployments replica too low +variable "deployments_replica_too_low_enabled" { + description = "Flag to enable Deployment replica too low monitor" + type = string + default = "true" +} + +variable "deployments_replica_too_low_message" { + description = "Custom message for Deployment replica too low monitor" + type = string + default = "" +} + +variable "deployments_replica_too_low_time_aggregator" { + description = "Monitor aggregator for Deployment replica too low [available values: min, max or avg]" + type = string + default = "max" +} + +variable "deployments_replica_too_low_timeframe" { + description = "Monitor timeframe for Deployment replica too low [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_15m" +} + +variable "deployments_replica_too_low_threshold_critical" { + description = "Deployment replica too low critical threshold" + type = number + default = 0 +} + +variable "deployments_replica_too_low_extra_tags" { + description = "Extra tags for Deployment replica too low monitor" + type = list(string) + default = [] +} + +# HPA cannot scale up further +variable "hpa_cannot_scaleup_further_enabled" { + description = "Flag to enable HPA cannot scale up further monitor" + type = string + default = "true" +} + +variable "hpa_cannot_scaleup_further_message" { + description = "Custom message for HPA cannot scale up further monitor" + type = string + default = "" +} + +variable "hpa_cannot_scaleup_further_time_aggregator" { + description = "Monitor aggregator for HPA cannot scale up further [available values: min, max or avg]" + type = string + default = "avg" +} + +variable "hpa_cannot_scaleup_further_timeframe" { + description = "Monitor timeframe for HPA cannot scale up further [available values: `last_#m` (1, 5, 10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_30m" +} + +variable "hpa_cannot_scaleup_further_threshold_critical" { + description = "HPA cannot scale up further critical threshold" + type = number + default = 100 +} + +variable "hpa_cannot_scaleup_further_threshold_warning" { + description = "HPA cannot scale up further warning threshold" + type = number + default = 90 +} + +variable "hpa_cannot_scaleup_further_extra_tags" { + description = "Extra tags for HPA cannot scale up further monitor" + type = list(string) + default = [] +} + +# Pod Disruption Budget not respected +variable "pod_disruption_budget_not_respected_enabled" { + description = "Flag to enable Pod Disruption Budget not respected monitor" + type = string + default = "true" +} + +variable "pod_disruption_budget_not_respected_message" { + description = "Custom message for Pod Disruption Budget not respected monitor" + type = string + default = "" +} + +variable "pod_disruption_budget_not_respected_time_aggregator" { + description = "Monitor aggregator for Pod Disruption Budget not respected [available values: min, max or avg]" + type = string + default = "max" +} + +variable "pod_disruption_budget_not_respected_timeframe" { + description = "Monitor timeframe for Pod Disruption Budget not respected [available values: `last_#m` (1, 5, 
10, 15, or 30), `last_#h` (1, 2, or 4), or `last_1d`]" + type = string + default = "last_15m" +} + +variable "pod_disruption_budget_not_respected_threshold_critical" { + description = "Pod Disruption Budget not respected critical threshold" + type = number + default = 0 +} + +variable "pod_disruption_budget_not_respected_extra_tags" { + description = "Extra tags for Pod Disruption Budget not respected monitor" + type = list(string) + default = [] +} + +# General filter tags variable "replica_group_by" { default = ["kube_namespace", "kube_replica_set", "kube_cluster_name"] description = "Select group by element on replicaset monitors" diff --git a/caas/kubernetes/workload/monitors-k8s-workload.tf b/caas/kubernetes/workload/monitors-k8s-workload.tf index 7cceb884..3e58f75f 100644 --- a/caas/kubernetes/workload/monitors-k8s-workload.tf +++ b/caas/kubernetes/workload/monitors-k8s-workload.tf @@ -142,3 +142,155 @@ EOQ tags = concat(local.common_tags, var.tags, var.replica_current_extra_tags) } +resource "datadog_monitor" "deployments_replica_too_low" { + count = var.deployments_replica_too_low_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Deployment {{kube_deployment}} replica too low on {{kube_cluster_name}}" + message = coalesce(var.deployments_replica_too_low_message, var.message) + type = "query alert" + + query = < ${var.daemonset_pods_not_ready_threshold_critical} +EOQ + + monitor_thresholds { + critical = var.daemonset_pods_not_ready_threshold_critical + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + tags = concat(local.common_tags, var.tags, var.daemonset_pods_not_ready_extra_tags) +} + +resource "datadog_monitor" "statefulset_pods_not_ready" { + count = var.statefulset_pods_not_ready_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes StatefulSet {{kube_stateful_set}} not ready on {{kube_cluster_name}}" + message = coalesce(var.statefulset_pods_not_ready_message, var.message) + type = "query alert" + + query = <= ${var.hpa_cannot_scaleup_further_threshold_critical} +EOQ + + monitor_thresholds { + critical = var.hpa_cannot_scaleup_further_threshold_critical + warning = var.hpa_cannot_scaleup_further_threshold_warning + } + + evaluation_delay = var.evaluation_delay + new_host_delay = var.new_host_delay + new_group_delay = var.new_group_delay + notify_no_data = false + renotify_interval = 0 + notify_audit = false + timeout_h = var.timeout_h + include_tags = true + require_full_window = true + + tags = concat(local.common_tags, var.tags, var.hpa_cannot_scaleup_further_extra_tags) +} + +resource "datadog_monitor" "pod_disruption_budget_not_respected" { + count = var.pod_disruption_budget_not_respected_enabled == "true" ? 1 : 0 + name = "${var.prefix_slug == "" ? "" : "[${var.prefix_slug}]"}[${var.environment}] Kubernetes Pod Disruption Budget {{poddisruptionbudget}} not respected on {{kube_cluster_name}}" + message = coalesce(var.pod_disruption_budget_not_respected_message, var.message) + type = "query alert" + + query = <