Merge pull request #369 from communitiesuk/alarm-descriptions
Add more detailed descriptions to CloudWatch alarms
BenRamchandani authored Dec 20, 2023
2 parents fb4c2ec + 4eebe8d commit 39ccd75
Showing 9 changed files with 138 additions and 45 deletions.
7 changes: 6 additions & 1 deletion terraform/modules/active_directory/ldap_nlb.tf
@@ -138,7 +138,12 @@ resource "aws_cloudwatch_metric_alarm" "ldap_lb_healthy_count_low" {
threshold = 0
evaluation_periods = 1

alarm_description = "The Active Directory domain controller in use is unhealthy"
alarm_description = <<EOF
The Active Directory Domain Controller in use is unhealthy.
This can prevent logins and severely affect Delta + CPM.
We can swap to using the other DC by updating the ${aws_lb_target_group.ldap.name} and ${aws_lb_target_group.ldaps.name} target groups (we do not leave both in the target group long term as it can cause errors updating users).
The Domain Controllers are managed by AWS Directory Service, so we have very limited visibility; if this does not resolve itself we will likely need to raise a ticket with AWS support.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "breaching"
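The change in this file is the shape the whole PR follows: swap a one-line alarm_description for a <<EOF heredoc so the alert that reaches responders carries its own runbook notes, including interpolated values. A minimal, self-contained sketch of that pattern, using hypothetical names rather than this module's real alarm:

variable "alarms_sns_topic_arn" {
  type = string
}

variable "target_group_name" {
  type = string
}

resource "aws_cloudwatch_metric_alarm" "example_healthy_hosts_low" {
  alarm_name          = "example-healthy-hosts-low"
  comparison_operator = "LessThanOrEqualToThreshold"
  evaluation_periods  = 1
  metric_name         = "HealthyHostCount"
  namespace           = "AWS/NetworkELB"
  period              = 60
  statistic           = "Minimum"
  threshold           = 0
  treat_missing_data  = "breaching"

  # Heredocs interpolate like any other Terraform string, so the description can
  # reference live values alongside the investigation steps. Dimensions are
  # omitted for brevity; a real alarm would scope this to a target group and NLB.
  alarm_description = <<EOF
Target group ${var.target_group_name} has no healthy targets.
First line: what is broken. Remaining lines: what the responder should do about it.
EOF

  alarm_actions = [var.alarms_sns_topic_arn]
  ok_actions    = [var.alarms_sns_topic_arn]
}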
12 changes: 10 additions & 2 deletions terraform/modules/cloudfront_alb_monitoring_instance/alb_alarms.tf
@@ -5,7 +5,11 @@ resource "aws_cloudwatch_metric_alarm" "alb_target_server_error_rate_alarm" {

threshold = var.alb_target_server_error_rate_alarm_threshold_percent

alarm_description = "High ALB target 5xx error rate"
alarm_description = <<EOF
High ALB target 5xx error rate for the ${var.prefix} app.
This means the load balancer is receiving a large number of 5xx responses from the application servers, which may indicate a problem with the application.
Investigate the application's logs.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -47,7 +51,11 @@ resource "aws_cloudwatch_metric_alarm" "alb_target_client_error_rate_alarm" {

threshold = var.alb_target_client_error_rate_alarm_threshold_percent

alarm_description = "High ALB target 4xx error rate"
alarm_description = <<EOF
High ALB target 4xx error rate for the ${var.prefix} app.
This means the load balancer is receiving a large number of 4xx responses from the application servers, which may indicate a problem with the application.
Investigate the application's logs.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
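Both alarms here compare a percentage against a threshold variable, which implies a metric-math expression rather than a single metric. The query itself is outside the visible diff; the sketch below shows one plausible way such a rate is computed, using the standard AWS/ApplicationELB metrics and an assumed alb_arn_suffix variable — not this module's actual definition.

variable "alarms_sns_topic_arn" {
  type = string
}

variable "alb_arn_suffix" {
  type = string
}

variable "alb_target_server_error_rate_alarm_threshold_percent" {
  type    = number
  default = 5
}

resource "aws_cloudwatch_metric_alarm" "example_target_5xx_rate" {
  alarm_name          = "example-alb-target-5xx-rate"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  evaluation_periods  = 5
  threshold           = var.alb_target_server_error_rate_alarm_threshold_percent
  treat_missing_data  = "notBreaching"

  # Percentage of target responses that were 5xx over each one-minute period.
  metric_query {
    id          = "error_rate"
    expression  = "100 * errors / requests"
    label       = "Target 5xx rate (%)"
    return_data = true
  }

  metric_query {
    id = "errors"
    metric {
      metric_name = "HTTPCode_Target_5XX_Count"
      namespace   = "AWS/ApplicationELB"
      period      = 60
      stat        = "Sum"
      dimensions = {
        LoadBalancer = var.alb_arn_suffix
      }
    }
  }

  metric_query {
    id = "requests"
    metric {
      metric_name = "RequestCount"
      namespace   = "AWS/ApplicationELB"
      period      = 60
      stat        = "Sum"
      dimensions = {
        LoadBalancer = var.alb_arn_suffix
      }
    }
  }

  alarm_description = "High ALB target 5xx error rate (sketch)."
  alarm_actions     = [var.alarms_sns_topic_arn]
  ok_actions        = [var.alarms_sns_topic_arn]
}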
@@ -6,10 +6,6 @@ provider "aws" {
region = "us-east-1"
}

locals {
alarm_description_template = "Average distribution %v %v last %d minutes"
}

# We need to enable enhanced monitoring to get 4xx, 5xx, OriginLatency & CacheHitRate (+ other metrics)
resource "aws_cloudfront_monitoring_subscription" "main" {
distribution_id = var.cloudfront_distribution_id
@@ -33,7 +29,11 @@ resource "aws_cloudwatch_metric_alarm" "client_error_rate_alarm" {
threshold = var.cloudfront_client_error_rate_alarm_threshold_percent
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Error Rate", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is returning a large number of 4xx errors to users.
This usually indicates the application itself is returning 4xx errors, but could be due to CloudFront rejecting the requests for another reason.
Look at the application/ALB metrics first, then investigate the application itself, the WAF logs in CloudWatch, or the CloudFront logs in S3 as appropriate.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -83,7 +83,11 @@ resource "aws_cloudwatch_metric_alarm" "server_error_rate_alarm" {
threshold = var.cloudfront_server_error_rate_alarm_threshold_percent
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Error Rate", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is returning a large number of 5xx errors to users.
This usually indicates the application itself is returning 5xx errors, but could be due to CloudFront rejecting the requests for another reason, or not being able to reach the load balancer.
Look at the application/ALB metrics first to determine whether the issue is with the application or CloudFront, then investigate the application itself or the CloudFront logs in S3 as appropriate.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -137,7 +141,10 @@ resource "aws_cloudwatch_metric_alarm" "origin_latency_high_alarm" {
threshold = var.cloudfront_average_origin_latency_high_alarm_threshold_ms
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Origin Latency", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is reporting that requests to its origin (the ALB behind it) are taking longer than expected on average.
This can be a false alarm if only a small number of users are using the service, so it can be ignored outside of business hours.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -162,7 +169,10 @@ resource "aws_cloudwatch_metric_alarm" "origin_latency_p90_high_alarm" {
threshold = var.cloudfront_p90_origin_latency_high_alarm_threshold_ms
treat_missing_data = "notBreaching" # Data is missing if there are no requests

alarm_description = format(local.alarm_description_template, "Origin Latency P90", "High", var.cloudfront_metric_period_seconds * 2 / 60)
alarm_description = <<EOF
The ${var.prefix} CloudFront distribution is reporting that requests to its origin (the ALB behind it) are taking longer than expected (90th percentile > ${var.cloudfront_p90_origin_latency_high_alarm_threshold_ms}ms).
This can be a false alarm if only a small number of users are using the service, so it can be ignored outside of business hours.
EOF
alarm_actions = [var.alarms_sns_topic_global_arn]
ok_actions = [var.alarms_sns_topic_global_arn]

@@ -184,7 +194,7 @@ resource "aws_cloudwatch_metric_alarm" "ddos_attack" {
period = "60"
statistic = "Average"
threshold = "0"
alarm_description = "Triggers when AWS Shield Advanced detects a DDoS attack"
alarm_description = "Triggers when AWS Shield Advanced detects a DDoS attack. Escalate immediately."
treat_missing_data = "notBreaching"
alarm_actions = [var.security_sns_topic_global_arn]
ok_actions = [var.security_sns_topic_global_arn]
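The comment above aws_cloudfront_monitoring_subscription is the important prerequisite: the 4xx/5xx rates, OriginLatency and CacheHitRate metrics only exist once the distribution's additional (paid) metrics are enabled, and CloudFront metrics live in us-east-1, hence the provider block at the top of this file. The resource is truncated in the diff; its usual shape in the AWS provider is roughly the sketch below, written from memory of the provider schema rather than copied from this repository.

variable "cloudfront_distribution_id" {
  type = string
}

resource "aws_cloudfront_monitoring_subscription" "main" {
  distribution_id = var.cloudfront_distribution_id

  monitoring_subscription {
    realtime_metrics_subscription_config {
      # "Enabled" turns on the enhanced per-distribution metrics the alarms above rely on.
      realtime_metrics_subscription_status = "Enabled"
    }
  }
}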
33 changes: 24 additions & 9 deletions terraform/modules/jaspersoft/monitoring.tf
@@ -1,7 +1,3 @@
locals {
alarm_description_template = "Average instance %v utilization %v last %d minutes"
}

resource "aws_cloudwatch_metric_alarm" "cpu_utilisation_high" {
alarm_name = "jaspersoft-${var.environment}-cpu-high"
comparison_operator = "GreaterThanThreshold"
@@ -12,7 +8,11 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilisation_high" {
statistic = "Average"
threshold = 80

alarm_description = format(local.alarm_description_template, "CPU", "High", 10)
alarm_description = <<EOF
High CPU usage on JasperReports server.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate or reboot.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -30,7 +30,11 @@ resource "aws_cloudwatch_metric_alarm" "memory_utilisation_high" {
statistic = "Maximum"
threshold = 80

alarm_description = format(local.alarm_description_template, "Memory Usage", "High", 10)
alarm_description = <<EOF
High memory usage on JasperReports server.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate or reboot.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -48,7 +52,11 @@ resource "aws_cloudwatch_metric_alarm" "disk_utilisation_high" {
statistic = "Maximum"
threshold = 80

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", 10)
alarm_description = <<EOF
High disk use on JasperReports server.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -66,7 +74,10 @@ resource "aws_cloudwatch_metric_alarm" "limited_free_storage_space" {
statistic = "Minimum"
threshold = 3000000000 // 3 GB. At time of writing 7/10GB free.

alarm_description = "Low storage space remaining on JasperReports RDS instance"
alarm_description = <<EOF
Low storage space remaining on JasperReports RDS instance.
Non-critical, only affects Delta reports.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -85,7 +96,11 @@ resource "aws_cloudwatch_metric_alarm" "healthy_host_low" {
statistic = "Minimum"
threshold = 1

alarm_description = "There are no healthy hosts"
alarm_description = <<EOF
The JasperReports server is unhealthy.
Non-critical, only affects Delta reports.
Connect to the instance using Systems Manager Session Manager to investigate or reboot.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "breaching"
63 changes: 52 additions & 11 deletions terraform/modules/marklogic/monitoring_alarms.tf
@@ -12,7 +12,11 @@ resource "aws_cloudwatch_metric_alarm" "cpu_utilisation_high" {
statistic = "Average"
threshold = 95 # TODO: DT-300 reduce this

alarm_description = format(local.alarm_description_template, "CPU", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "CPU", "High", 10)}
This indicates MarkLogic is busy and will normally resolve on its own.
If it persists, consider clearing the task queue.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -32,7 +36,11 @@ resource "aws_cloudwatch_metric_alarm" "memory_utilisation_high" {
statistic = "Maximum"
threshold = 90

alarm_description = format(local.alarm_description_template, "Memory Usage", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "Memory Usage", "High", 10)}
Monitor and then restart the cluster in an outage window.
MarkLogic will crash and fail over if it runs out of memory, which can take the whole cluster down.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -50,7 +58,11 @@ resource "aws_cloudwatch_metric_alarm" "memory_utilisation_high_sustained" {
statistic = "Maximum"
threshold = 85

alarm_description = format(local.alarm_description_template, "Memory Usage", "High (sustained)", 25)
alarm_description = <<EOF
${format(local.alarm_description_template, "Memory Usage", "High (sustained)", 25)}
The cluster may need to be restarted in an outage window.
MarkLogic will crash and fail over if it runs out of memory, which can take the whole cluster down.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -68,7 +80,12 @@ resource "aws_cloudwatch_metric_alarm" "system_disk_utilisation_high" {
statistic = "Maximum"
threshold = 90

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "Disk Usage", "High", 10)}
The system disk on one of the MarkLogic servers is nearly full. This disk should contain the OS only, not any MarkLogic data.
Connect using Systems Manager to investigate.
EOF

alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -88,7 +105,13 @@ resource "aws_cloudwatch_metric_alarm" "data_disk_utilisation_high" {
statistic = "Maximum"
threshold = 90

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", 10)
alarm_description = <<EOF
${format(local.alarm_description_template, "Disk Usage", "Very High", 10)}
The disk containing MarkLogic database data on one of the MarkLogic servers is nearly full.
Investigate what's causing the database to grow.
Note that if you resize the EBS volume in AWS you will need to manually remount it on each server.
EOF

alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -108,7 +131,12 @@ resource "aws_cloudwatch_metric_alarm" "data_disk_utilisation_high_sustained" {
statistic = "Maximum"
threshold = var.data_disk_usage_alarm_threshold_percent

alarm_description = format(local.alarm_description_template, "Disk Usage", "High", var.data_disk_usage_alarm_threshold_percent)
alarm_description = <<EOF
${format(local.alarm_description_template, "Disk Usage", "High", var.data_disk_usage_alarm_threshold_percent)}
The disk containing MarkLogic database data on one of the MarkLogic servers is starting to fill up.
MarkLogic needs to have enough space to store two copies of any database it's restoring to, so high disk space utilisation will prevent restoring backups long before it is completely full.
EOF

alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
insufficient_data_actions = [var.alarms_sns_topic_arn]
@@ -128,7 +156,12 @@ resource "aws_cloudwatch_metric_alarm" "unhealthy_host_high" {
statistic = "Maximum"
threshold = 0

alarm_description = "There is at least one unhealthy host"
alarm_description = <<EOF
At least one of the MarkLogic servers is being reported as unhealthy by a load balancer.
This is likely to significantly degrade Delta and should be escalated during business hours.
Investigate in the MarkLogic admin console, or by connecting to a server using Systems Manager.
The autoscaling groups will not replace servers just because the load balancer reports them unhealthy; they only replace servers that EC2 reports as unhealthy/offline.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "breaching"
@@ -153,6 +186,7 @@ resource "aws_cloudwatch_metric_alarm" "healthy_host_low" {
alarm_description = <<EOT
There are fewer healthy MarkLogic hosts than expected.
This is expected during weekly patching, but outside of that requires attention as MarkLogic often struggles to recover from node failure without manual intervention.
Investigate in the MarkLogic admin console, or by connecting to a server using Systems Manager.
EOT
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
@@ -173,7 +207,8 @@ resource "aws_cloudwatch_metric_alarm" "queue_length_high" {
alarm_description = <<EOT
EBS Queue length is higher than expected for at least one node in the MarkLogic cluster.
This means disk throughput is struggling to keep up.
This usually resolves itself, but if not check that the cluster healthy and not overloaded with too many tasks.
This is common when overnight jobs are running, and during backups/restores.
It usually resolves itself, but if not, check that the cluster is healthy and clear the task queue if necessary.
EOT
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
@@ -215,7 +250,10 @@ resource "aws_cloudwatch_metric_alarm" "swap_usage_high" {
statistic = "Maximum"
threshold = 1

alarm_description = "Swap usage percentage is higher than expected"
alarm_description = <<EOT
Swap usage percentage is higher than expected.
This could indicate MarkLogic is low on memory and may need to be restarted immediately.
EOT
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "notBreaching"
@@ -288,7 +326,7 @@ resource "aws_cloudwatch_metric_alarm" "time_since_delta_content_incremental_bac
namespace = "${var.environment}/MarkLogic"
period = 300
statistic = "Maximum"
threshold = 900 //15 hours in minutes
threshold = 900 // 15 hours in minutes

alarm_description = "Longer than expected since delta-content was incrementally backed up"
alarm_actions = [var.alarms_sns_topic_arn]
@@ -309,7 +347,10 @@ resource "aws_cloudwatch_metric_alarm" "task_server_queue_size_high" {
statistic = "Minimum"
threshold = 1000

alarm_description = "Task server queue size is larger than expected"
alarm_description = <<EOF
Task server queue size is larger than expected.
Consider clearing the task queue.
EOF
alarm_actions = [var.alarms_sns_topic_arn]
ok_actions = [var.alarms_sns_topic_arn]
treat_missing_data = "missing"
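Unlike the CloudFront and JasperSoft files, this module keeps its alarm_description_template local and interpolates the formatted one-line summary as the first line of each heredoc. A self-contained sketch of that combination follows — the template string mirrors the one removed from the JasperSoft module, since the MarkLogic module's own template sits outside this diff.

variable "alarms_sns_topic_arn" {
  type = string
}

locals {
  # Same shape as the template the JasperSoft module used to define.
  alarm_description_template = "Average instance %v utilization %v last %d minutes"
}

resource "aws_cloudwatch_metric_alarm" "example_cpu_high" {
  alarm_name          = "example-cpu-high"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 2
  metric_name         = "CPUUtilization"
  namespace           = "AWS/EC2"
  period              = 300
  statistic           = "Average"
  threshold           = 95

  # format() keeps the short, graph-friendly summary as the first line of the
  # description; the heredoc then appends the operator guidance this PR adds.
  alarm_description = <<EOF
${format(local.alarm_description_template, "CPU", "High", 10)}
This indicates the instance is busy and will normally resolve on its own.
EOF

  alarm_actions = [var.alarms_sns_topic_arn]
  ok_actions    = [var.alarms_sns_topic_arn]
}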
16 changes: 8 additions & 8 deletions terraform/modules/networking/firewall_monitoring.tf
@@ -120,10 +120,10 @@ resource "aws_cloudwatch_metric_alarm" "dropped_packets" {
statistic = "Sum"
threshold = "1000"
alarm_description = <<EOF
Network Firewall dropping large number of packets.
Likely cause: Firewall misconfiguration.
Possible security issue: Could indicate a noisy network intrusion, e.g. outbound port scan.
Review the Network Firewall blocked requests log group "${aws_cloudwatch_log_group.firewall_alert.name}".
Network Firewall dropping large number of packets.
Likely cause: Firewall misconfiguration.
Possible security issue: Could indicate a noisy network intrusion, e.g. outbound port scan.
Review the Network Firewall blocked requests log group "${aws_cloudwatch_log_group.firewall_alert.name}" and escalate if unsure.
EOF
treat_missing_data = "notBreaching"
dimensions = {
@@ -146,10 +146,10 @@ resource "aws_cloudwatch_metric_alarm" "nat_bytes_out" {
statistic = "Sum"
threshold = "10000000" # 10MB. Edit the description if you increase this significantly.
alarm_description = <<EOF
Spike in outgoing network traffic through the NAT Gateway.
Likely cause: The threshold for this alarm is set low and probably needs increasing.
Possible security issue: Could indicate a data exfiltration attempt.
Review the Network Firewall allowed requests log group "${aws_cloudwatch_log_group.firewall_flow.name}".
Spike in outgoing network traffic through the NAT Gateway.
Likely cause: The threshold for this alarm is set low and probably needs increasing.
Possible security issue: Could indicate a data exfiltration attempt.
Review the Network Firewall allowed requests log group "${aws_cloudwatch_log_group.firewall_flow.name}" and escalate if unsure.
EOF
treat_missing_data = "notBreaching"
dimensions = {