Merge pull request #369 from communitiesuk/alarm-descriptions
Add more detailed descriptions to CloudWatch alarms
BenRamchandani authored Dec 20, 2023
2 parents fb4c2ec + 4eebe8d commit 39ccd75
Showing 9 changed files with 138 additions and 45 deletions.
7 changes: 6 additions & 1 deletion terraform/modules/active_directory/ldap_nlb.tf
@@ -138,7 +138,12 @@ resource "aws_cloudwatch_metric_alarm" "ldap_lb_healthy_count_low" {
threshold = 0
evaluation_periods = 1

alarm_description = "The Active Directory domain controller in use is unhealthy"
alarm_description = <<EOF
The Active Directory Domain Controller in use is unhealthy.
This can prevent logins and severely affect Delta + CPM.
We can swap to using the other DC by updating the ${aws_lb_target_group.ldap.name} and ${aws_lb_target_group.ldaps.name} target groups (we do not leave both in the target group long term as it can cause errors updating users).
The Domain Controllers are managed by AWS Directory Service, so we have very limited visibility; if this does not resolve itself we will likely need to raise a ticket with AWS support.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "breaching"
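The change in this file is the shape the whole PR follows: swap a one-line alarm_description for a <<EOF heredoc so the alert that reaches responders carries its own runbook notes, including interpolated values. A minimal, self-contained sketch of that pattern, using hypothetical names rather than this module's real alarm:

variable "alarms_sns_topic_arn" {
  type = string
}

variable "target_group_name" {
  type = string
}

resource "aws_cloudwatch_metric_alarm" "example_healthy_hosts_low" {
  alarm_name          = "example-healthy-hosts-low"
  comparison_operator = "LessThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "HealthyHostCount"
  namespace           = "AWS/NetworkELB"
  period              = 60
  statistic           = "Minimum"
  threshold           = 0
  treat_missing_data  = "breaching"

  # Heredocs interpolate like any other Terraform string, so the description can
  # reference live values alongside the investigation steps. Dimensions are
  # omitted for brevity; a real alarm would scope this to a target group and NLB.
  alarm_description = <<EOF
Target group ${var.target_group_name} has no healthy targets.
First line: what is broken. Remaining lines: what the responder should do about it.
EOF

  alarm_actions = [var.alarms_sns_topic_arn]
  ok_actions    = [var.alarms_sns_topic_arn]
}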
12 changes: 10 additions & 2 deletions terraform/modules/cloudfront_alb_monitoring_instance/alb_alarms.tf
@@ -5,7 +5,11 @@ resource "aws_cloudwatch_metric_alarm" "alb_target_server_error_rate_alarm" {

threshold = var.alb_target_server_error_rate_alarm_threshold_percent

alarm_description = "High ALB target 5xx error rate"
alarm_description = <<EOF
High ALB target 5xx error rate for the ${var.prefix} app.
This means the load balancer is receiving a large number of 5xx responses from the application servers, which may indicate a problem with the application.
Investigate the application's logs.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -47,7 +51,11 @@ resource "aws_cloudwatch_metric_alarm" "alb_target_client_error_rate_alarm" {

threshold = var.alb_target_client_error_rate_alarm_threshold_percent

alarm_description = "High ALB target 4xx error rate"
alarm_description = <<EOF
High ALB target 4xx error rate for the ${var.prefix} app.
This means the load balancer is receiving a large number of 4xx responses from the application servers, which may indicate a problem with the application.
Investigate the application's logs.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
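Both alarms here compare a percentage against a threshold variable, which implies a metric-math expression rather than a single metric. The query itself is outside the visible diff; the sketch below shows one plausible way such a rate is computed, using the standard AWS/ApplicationELB metrics and an assumed alb_arn_suffix variable — not this module's actual definition.

variable "alarms_sns_topic_arn" {
  type = string
}

variable "alb_arn_suffix" {
  type = string
}

variable "alb_target_server_error_rate_alarm_threshold_percent" {
  type    = number
  default = 5
}

resource "aws_cloudwatch_metric_alarm" "example_target_5xx_rate" {
  alarm_name          = "example-alb-target-5xx-rate"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 5
  threshold           = var.alb_target_server_error_rate_alarm_threshold_percent
  treat_missing_data  = "notBreaching"

  # Percentage of target responses that were 5xx over each one-minute period.
  metric_query {
    id          = "error_rate"
    expression  = "100 * errors / requests"
    label       = "Target 5xx rate (%)"
    return_data = true
  }

  metric_query {
    id = "errors"
    metric {
      metric_name = "HTTPCode_Target_5XX_Count"
      namespace   = "AWS/ApplicationELB"
      period      = 60
      stat        = "Sum"
      dimensions = {
        LoadBalancer = var.alb_arn_suffix
      }
    }
  }

  metric_query {
    id = "requests"
    metric {
      metric_name = "RequestCount"
      namespace   = "AWS/ApplicationELB"
      period      = 60
      stat        = "Sum"
      dimensions = {
        LoadBalancer = var.alb_arn_suffix
      }
    }
  }

  alarm_description = "High ALB target 5xx error rate (sketch)."
  alarm_actions     = [var.alarms_sns_topic_arn]
  ok_actions        = [var.alarms_sns_topic_arn]
}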
@@ -6,10 +6,6 @@ provider "aws" {
region = "us-east-1"
}

locals {
alarm_description_template = "Average distribution %v %v last %d minutes"
}

# We need to enable enhanced monitoring to get 4xx, 5xx, OriginLatency & CacheHitRate (+ other metrics)
resource "aws_cloudfront_monitoring_subscription" "main" {
distribution_id = var.cloudfront_distribution_id
@@ -33,7 +29,11 @@ resource "aws_cloudwatch_metric_alarm" "client_error_rate_alarm" {
threshold = var.cloudfront_client_error_rate_alarm_threshold_percent
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Error Rate", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is returning a large number of 4xx errors to users.
This usually indicates the application itself is returning 4xx errors, but could be due to CloudFront rejecting the requests for another reason.
Look at the application/ALB metrics first, then investigate the application itself, the WAF logs in CloudWatch, or the CloudFront logs in S3 as appropriate.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -83,7 +83,11 @@ resource "aws_cloudwatch_metric_alarm" "server_error_rate_alarm" {
threshold = var.cloudfront_server_error_rate_alarm_threshold_percent
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Error Rate", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is returning a large number of 5xx errors to users.
This usually indicates the application itself is returning 5xx errors, but could be due to CloudFront rejecting the requests for another reason, or not being able to reach the load balancer.
Look at the application/ALB metrics first to determine whether the issue is with the application or CloudFront, then investigate the application itself or the CloudFront logs in S3 as appropriate.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -137,7 +141,10 @@ resource "aws_cloudwatch_metric_alarm" "origin_latency_high_alarm" {
threshold = var.cloudfront_average_origin_latency_high_alarm_threshold_ms
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Origin Latency", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is reporting that requests to its origin (the ALB behind it) are taking longer than expected on average.
This can be a false alarm if only a small number of users are using the service, so it can be ignored outside of business hours.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -162,7 +169,10 @@ resource "aws_cloudwatch_metric_alarm" "origin_latency_p90_high_alarm" {
threshold = var.cloudfront_p90_origin_latency_high_alarm_threshold_ms
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Origin Latency P90", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is reporting that requests to its origin (the ALB behind it) are taking longer than expected (90th percentile > ${var.cloudfront_p90_origin_latency_high_alarm_threshold_ms}ms).
This can be a false alarm if only a small number of users are using the service, so it can be ignored outside of business hours.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -184,7 +194,7 @@ resource "aws_cloudwatch_metric_alarm" "ddos_attack" {
period = "60"
statistic = "Average"
threshold = "0"
alarm_description = "Triggers when AWS Shield Advanced detects a DDoS attack"
alarm_description = "Triggers when AWS Shield Advanced detects a DDoS attack. Escalate immediately."
treat_missing_data = "notBreaching"
alarm_actions = [var.security_sns_topic_global_arn]
ok_actions = [var.security_sns_topic_global_arn]
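The comment above aws_cloudfront_monitoring_subscription is the important prerequisite: the 4xx/5xx rates, OriginLatency and CacheHitRate metrics only exist once the distribution's additional (paid) metrics are enabled, and CloudFront metrics live in us-east-1, hence the provider block at the top of this file. The resource is truncated in the diff; its usual shape in the AWS provider is roughly the sketch below, written from memory of the provider schema rather than copied from this repository.

variable "cloudfront_distribution_id" {
  type = string
}

resource "aws_cloudfront_monitoring_subscription" "main" {
  distribution_id = var.cloudfront_distribution_id

  monitoring_subscription {
    realtime_metrics_subscription_config {
      # "Enabled" turns on the enhanced per-distribution metrics the alarms above rely on.
      realtime_metrics_subscription_status = "Enabled"
    }
  }
}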
33 changes: 24 additions & 9 deletions terraform/modules/jaspersoft/monitoring.tf
@@ -1,7 +1,3 @@
locals {
alarm_description_template = "Average instance %v utilization %v last %d minutes"
}

resource "aws_cloudwatch_metric_alarm" "cpu_utilisation_high" {
alarm_name = "jaspersoft-${var.environment}-cpu-high"
comparison_operator = "GreaterThanThreshold"
@@ -12,7 +8,11 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilisation_high" {
statistic = "Average"
threshold = 80

alarm_description = format(local.alarm_description_template, "CPU", "High", 10)
alarm_description = <<EOF
High CPU usage on JasperReports server.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate or reboot.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -30,7 +30,11 @@ resource "aws_cloudwatch_metric_alarm" "memory_utilisation_high" {
statistic = "Maximum"
threshold = 80

alarm_description = format(local.alarm_description_template, "Memory Usage", "High", 10)
alarm_description = <<EOF
High memory usage on JasperReports server.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate or reboot.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -48,7 +52,11 @@ resource "aws_cloudwatch_metric_alarm" "disk_utilisation_high" {
statistic = "Maximum"
threshold = 80

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", 10)
alarm_description = <<EOF
High disk use on JasperReports server.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -66,7 +74,10 @@ resource "aws_cloudwatch_metric_alarm" "limited_free_storage_space" {
statistic = "Minimum"
threshold = 3000000000 // 3 GB. At time of writing 7/10GB free.

alarm_description = "Low storage space remaining on JasperReports RDS instance"
alarm_description = <<EOF
Low storage space remaining on JasperReports RDS instance.
Non-critical, only affects Delta reports.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -85,7 +96,11 @@ resource "aws_cloudwatch_metric_alarm" "healthy_host_low" {
statistic = "Minimum"
threshold = 1

alarm_description = "There are no healthy hosts"
alarm_description = <<EOF
The JasperReports server is unhealthy.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate or reboot.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "breaching"
63 changes: 52 additions & 11 deletions terraform/modules/marklogic/monitoring_alarms.tf
@@ -12,7 +12,11 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilisation_high" {
statistic = "Average"
threshold = 95 # TODO: DT-300 reduce this

alarm_description = format(local.alarm_description_template, "CPU", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "CPU", "High", 10)}
This indicates MarkLogic is busy and will normally resolve on its own.
If it persists, consider clearing the task queue.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -32,7 +36,11 @@ resource "aws_cloudwatch_metric_alarm" "memory_utilisation_high" {
statistic = "Maximum"
threshold = 90

alarm_description = format(local.alarm_description_template, "Memory Usage", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "Memory Usage", "High", 10)}
Monitor and then restart the cluster in an outage window.
MarkLogic will crash and fail over if it runs out of memory, which can take the whole cluster down.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -50,7 +58,11 @@ resource "aws_cloudwatch_metric_alarm" "memory_utilisation_high_sustained" {
statistic = "Maximum"
threshold = 85

alarm_description = format(local.alarm_description_template, "Memory Usage", "High (sustained)", 25)
alarm_description = <<EOF
${format(local.alarm_description_template, "Memory Usage", "High (sustained)", 25)}
The cluster may need to be restarted in an outage window.
MarkLogic will crash and fail over if it runs out of memory, which can take the whole cluster down.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -68,7 +80,12 @@ resource "aws_cloudwatch_metric_alarm" "system_disk_utilisation_high" {
statistic = "Maximum"
threshold = 90

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "Disk Usage", "High", 10)}
The system disk on one of the MarkLogic servers is nearly full. This disk should contain the OS only, not any MarkLogic data.
Connect using Systems Manager to investigate.
EOF

alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -88,7 +105,13 @@ resource "aws_cloudwatch_metric_alarm" "data_disk_utilisation_high" {
statistic = "Maximum"
threshold = 90

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "Disk Usage", "Very High", 10)}
The disk containing MarkLogic database data on one of the MarkLogic servers is nearly full.
Investigate what's causing the database to grow.
Note that if you resize the EBS volume in AWS you will need to manually remount it on each server.
EOF

alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -108,7 +131,12 @@ resource "aws_cloudwatch_metric_alarm" "data_disk_utilisation_high_sustained" {
statistic = "Maximum"
threshold = var.data_disk_usage_alarm_threshold_percent

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", var.data_disk_usage_alarm_threshold_percent)
alarm_description = <<EOF
${format(local.alarm_description_template, "Disk Usage", "High", var.data_disk_usage_alarm_threshold_percent)}
The disk containing MarkLogic database data on one of the MarkLogic servers is starting to fill up.
MarkLogic needs to have enough space to store two copies of any database it's restoring to, so high disk space utilisation will prevent restoring backups long before it is completely full.
EOF

alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -128,7 +156,12 @@ resource "aws_cloudwatch_metric_alarm" "unhealthy_host_high" {
statistic = "Maximum"
threshold = 0

alarm_description = "There is at least one unhealthy host"
alarm_description = <<EOF
At least one of the MarkLogic servers is being reported as unhealthy by a load balancer.
This is likely to significantly degrade Delta and should be escalated during business hours.
Investigate in the MarkLogic admin console, or by connecting to a server using Systems Manager.
The autoscaling groups will not replace servers just because the load balancer reports them unhealthy; they only replace servers that EC2 reports as unhealthy/offline.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "breaching"
@@ -153,6 +186,7 @@ resource "aws_cloudwatch_metric_alarm" "healthy_host_low" {
alarm_description = <<EOT
There are fewer healthy MarkLogic hosts than expected.
This is expected during weekly patching, but outside of that requires attention as MarkLogic often struggles to recover from node failure without manual intervention.
Investigate in the MarkLogic admin console, or by connecting to a server using Systems Manager.
EOT
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
@@ -173,7 +207,8 @@ resource "aws_cloudwatch_metric_alarm" "queue_length_high" {
alarm_description = <<EOT
EBS Queue length is higher than expected for at least one node in the MarkLogic cluster.
This means disk throughput is struggling to keep up.
This usually resolves itself, but if not check that the cluster healthy and not overloaded with too many tasks.
This is common when overnight jobs are running, and during backups/restores.
It usually resolves itself, but if not, check that the cluster is healthy and clear the task queue if necessary.
EOT
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
@@ -215,7 +250,10 @@ resource "aws_cloudwatch_metric_alarm" "swap_usage_high" {
statistic = "Maximum"
threshold = 1

alarm_description = "Swap usage percentage is higher than expected"
alarm_description = <<EOT
Swap usage percentage is higher than expected.
This could indicate MarkLogic is low on memory and may need to be restarted immediately.
EOT
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "notBreaching"
@@ -288,7 +326,7 @@ resource "aws_cloudwatch_metric_alarm" "time_since_delta_content_incremental_bac
namespace = "${var.environment}/MarkLogic"
period = 300
statistic = "Maximum"
threshold = 900 //15 hours in minutes
threshold = 900 // 15 hours in minutes

alarm_description = "Longer than expected since delta-content was incrementally backed up"
alarm_actions = [var.alarms_sns_topic_arn]
@@ -309,7 +347,10 @@ resource "aws_cloudwatch_metric_alarm" "task_server_queue_size_high" {
statistic = "Minimum"
threshold = 1000

alarm_description = "Task server queue size is larger than expected"
alarm_description = <<EOF
Task server queue size is larger than expected.
Consider clearing the task queue.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "missing"
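Unlike the CloudFront and JasperSoft files, this module keeps its alarm_description_template local and interpolates the formatted one-line summary as the first line of each heredoc. A self-contained sketch of that combination follows — the template string mirrors the one removed from the JasperSoft module, since the MarkLogic module's own template sits outside this diff.

variable "alarms_sns_topic_arn" {
  type = string
}

locals {
  # Same shape as the template the JasperSoft module used to define.
  alarm_description_template = "Average instance %v utilization %v last %d minutes"
}

resource "aws_cloudwatch_metric_alarm" "example_cpu_high" {
  alarm_name          = "example-cpu-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 2
  metric_name         = "CPUUtilization"
  namespace           = "AWS/EC2"
  period              = 300
  statistic           = "Average"
  threshold           = 95

  # format() keeps the short, graph-friendly summary as the first line of the
  # description; the heredoc then appends the operator guidance this PR adds.
  alarm_description = <<EOF
${format(local.alarm_description_template, "CPU", "High", 10)}
This indicates the instance is busy and will normally resolve on its own.
EOF

  alarm_actions = [var.alarms_sns_topic_arn]
  ok_actions    = [var.alarms_sns_topic_arn]
}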
16 changes: 8 additions & 8 deletions terraform/modules/networking/firewall_monitoring.tf
@@ -120,10 +120,10 @@ resource "aws_cloudwatch_metric_alarm" "dropped_packets" {
statistic = "Sum"
threshold = "1000"
alarm_description = <<EOF
Network Firewall dropping large number of packets.
Likely cause: Firewall misconfiguration.
Possible security issue: Could indicate a noisy network intrusion, e.g. outbound port scan.
Review the Network Firewall blocked requests log group "${aws_cloudwatch_log_group.firewall_alert.name}".
Network Firewall dropping large number of packets.
Likely cause: Firewall misconfiguration.
Possible security issue: Could indicate a noisy network intrusion, e.g. outbound port scan.
Review the Network Firewall blocked requests log group "${aws_cloudwatch_log_group.firewall_alert.name}" and escalate if unsure.
EOF
treat_missing_data = "notBreaching"
dimensions = {
@@ -146,10 +146,10 @@ resource "aws_cloudwatch_metric_alarm" "nat_bytes_out" {
statistic = "Sum"
threshold = "10000000" # 10MB. Edit the description if you increase this significantly.
alarm_description = <<EOF
Spike in outgoing network traffic through the NAT Gateway.
Likely cause: The threshold for this alarm is set low and probably needs increasing.
Possible security issue: Could indicate a data exfiltration attempt.
Review the Network Firewall allowed requests log group "${aws_cloudwatch_log_group.firewall_flow.name}".
Spike in outgoing network traffic through the NAT Gateway.
Likely cause: The threshold for this alarm is set low and probably needs increasing.
Possible security issue: Could indicate a data exfiltration attempt.
Review the Network Firewall allowed requests log group "${aws_cloudwatch_log_group.firewall_flow.name}" and escalate if unsure.
EOF
treat_missing_data = "notBreaching"
dimensions = {