Skip to content

Commit

Permalink
MLPAB-1663 - Continuously monitor dependency health checks (#922)
Browse files Browse the repository at this point in the history
* create dependency health check alarms

* create sns topic for service and dependency health checks per environment

* create all resources in the global region in order to support checks from multiple regions

* choose when to enable actions for health check alarms
  • Loading branch information
andrewpearce-digital authored Dec 14, 2023
1 parent e3ed697 commit 319c1a8
Show file tree
Hide file tree
Showing 9 changed files with 195 additions and 64 deletions.
21 changes: 0 additions & 21 deletions terraform/account/region/sns_topics.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,3 @@ resource "aws_sns_topic" "ecs_autoscaling_alarms" {
sqs_success_feedback_sample_rate = 100
provider = aws.region
}

# SNS topic named "health-checks", created through the aws.global provider alias.
# Encrypted with the key behind data.aws_kms_alias.sns_kms_key_alias, and configured
# with failure/success feedback roles for the application, firehose, http, lambda
# and sqs protocols, each with a success sample rate of 100.
resource "aws_sns_topic" "health_checks_global" {
name = "health-checks"
kms_master_key_id = data.aws_kms_alias.sns_kms_key_alias.target_key_id
application_failure_feedback_role_arn = data.aws_iam_role.sns_failure_feedback.arn
application_success_feedback_role_arn = data.aws_iam_role.sns_success_feedback.arn
application_success_feedback_sample_rate = 100
firehose_failure_feedback_role_arn = data.aws_iam_role.sns_failure_feedback.arn
firehose_success_feedback_role_arn = data.aws_iam_role.sns_success_feedback.arn
firehose_success_feedback_sample_rate = 100
http_failure_feedback_role_arn = data.aws_iam_role.sns_failure_feedback.arn
http_success_feedback_role_arn = data.aws_iam_role.sns_success_feedback.arn
http_success_feedback_sample_rate = 100
lambda_failure_feedback_role_arn = data.aws_iam_role.sns_failure_feedback.arn
lambda_success_feedback_role_arn = data.aws_iam_role.sns_success_feedback.arn
lambda_success_feedback_sample_rate = 100
sqs_failure_feedback_role_arn = data.aws_iam_role.sns_failure_feedback.arn
sqs_success_feedback_role_arn = data.aws_iam_role.sns_success_feedback.arn
sqs_success_feedback_sample_rate = 100
provider = aws.global
}
14 changes: 14 additions & 0 deletions terraform/environment/region/data_sources.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Looks up the KMS alias "<application tag>_sns_secret_encryption_key" through the
# aws.global provider alias. Its target key is used to encrypt the health check
# SNS topics.
data "aws_kms_alias" "sns_kms_key_alias_global" {
  provider = aws.global

  name = "alias/${data.aws_default_tags.current.tags.application}_sns_secret_encryption_key"
}

# Existing IAM role "SNSSuccessFeedback", read through the aws.global provider
# alias; referenced by the *_success_feedback_role_arn arguments of the SNS topics.
data "aws_iam_role" "sns_success_feedback" {
  provider = aws.global

  name = "SNSSuccessFeedback"
}

# Existing IAM role "SNSFailureFeedback", read through the aws.global provider
# alias; referenced by the *_failure_feedback_role_arn arguments of the SNS topics.
data "aws_iam_role" "sns_failure_feedback" {
  provider = aws.global

  name = "SNSFailureFeedback"
}
2 changes: 1 addition & 1 deletion terraform/environment/region/ecs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ module "app" {
aws_rum_guest_role_arn = data.aws_iam_role.rum_monitor_unauthenticated.arn
rum_monitor_application_id_secretsmanager_secret_arn = aws_secretsmanager_secret.rum_monitor_application_id.id
uid_base_url = var.uid_service.base_url
lpa_store_base_url = var.lpa_store_service.base_url
lpa_store_base_url = var.lpa_store_service.base_url
mock_onelogin_enabled = data.aws_default_tags.current.tags.environment-name != "production" && var.mock_onelogin_enabled
providers = {
aws.region = aws.region
Expand Down
142 changes: 142 additions & 0 deletions terraform/environment/region/health_checks.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Per-environment SNS topic that receives the service health check alarm's
# ALARM and OK notifications. Created through the aws.global provider alias.
resource "aws_sns_topic" "service_health_checks_global" {
  provider = aws.global

  name              = "${data.aws_default_tags.current.tags.environment-name}-service-health-checks"
  kms_master_key_id = data.aws_kms_alias.sns_kms_key_alias_global.target_key_id

  # Delivery feedback: the same failure/success roles for every protocol,
  # with every successful delivery sampled (rate 100).

  # application
  application_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  application_success_feedback_sample_rate = 100
  application_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # firehose
  firehose_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  firehose_success_feedback_sample_rate = 100
  firehose_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # http
  http_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  http_success_feedback_sample_rate = 100
  http_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # lambda
  lambda_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  lambda_success_feedback_sample_rate = 100
  lambda_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # sqs
  sqs_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  sqs_success_feedback_sample_rate = 100
  sqs_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn
}

# Route 53 HTTPS health check against the app's DNS record, requesting
# /health-check/service on port 443 from three checker regions.
resource "aws_route53_health_check" "service_health_check" {
  provider = aws.global

  # What to probe
  type          = "HTTPS"
  fqdn          = aws_route53_record.app.fqdn
  port          = 443
  resource_path = "/health-check/service"

  # How to probe: every 30 seconds, a single failure counts as unhealthy
  request_interval  = 30
  failure_threshold = 1
  measure_latency   = true
  regions           = ["us-east-1", "eu-west-1", "ap-southeast-1"]

  # Only the first 20 characters of the environment name are used here
  # (presumably to stay within a length limit on the reference - TODO confirm).
  reference_name = "${substr(data.aws_default_tags.current.tags.environment-name, 0, 20)}-service-hc"

  tags = {
    Name = "${data.aws_default_tags.current.tags.environment-name} service health check"
  }
}

# CloudWatch alarm on the service health check's AWS/Route53 HealthCheckStatus
# metric. It goes into ALARM when the minimum value over a single 60 second
# period is below 1 (Route 53 reports 1 for healthy and 0 for unhealthy).
# ALARM and OK transitions are published to the service health checks topic,
# but only when var.service_health_check_alarm_enabled is true.
resource "aws_cloudwatch_metric_alarm" "service_health_check" {
  provider = aws.global

  alarm_name = "${data.aws_default_tags.current.tags.environment-name}-service-health-check-alarm"
  # The description previously ended in a dangling "for"; name the endpoint
  # being checked so the notification text is complete.
  alarm_description = "${data.aws_default_tags.current.tags.environment-name} service health check for ${aws_route53_record.app.fqdn}"

  # Notifications
  actions_enabled = var.service_health_check_alarm_enabled
  alarm_actions   = [aws_sns_topic.service_health_checks_global.arn]
  ok_actions      = [aws_sns_topic.service_health_checks_global.arn]

  # Metric
  namespace   = "AWS/Route53"
  metric_name = "HealthCheckStatus"
  dimensions = {
    HealthCheckId = aws_route53_health_check.service_health_check.id
  }

  # Evaluation: one breaching datapoint in one period is enough to alarm
  statistic           = "Minimum"
  period              = 60
  comparison_operator = "LessThanThreshold"
  threshold           = 1
  evaluation_periods  = 1
  datapoints_to_alarm = 1
}

# PagerDuty integration on the main service, using the CloudWatch vendor.
# Its integration key forms the HTTPS endpoint that the service health check
# SNS topic is subscribed to.
resource "pagerduty_service_integration" "service_health_check" {
  service = data.pagerduty_service.main.id
  vendor  = data.pagerduty_vendor.cloudwatch.id
  name    = "Modernising LPA ${data.aws_default_tags.current.tags.environment-name} Service Health Check Alarm"
}

# Subscribes the PagerDuty events endpoint of the service health check
# integration to the service health checks topic over HTTPS.
resource "aws_sns_topic_subscription" "service_health_check" {
  provider = aws.global

  topic_arn = aws_sns_topic.service_health_checks_global.arn

  protocol               = "https"
  endpoint               = "https://events.pagerduty.com/integration/${pagerduty_service_integration.service_health_check.integration_key}/enqueue"
  endpoint_auto_confirms = true
}

# Per-environment SNS topic that receives the dependency health check alarm's
# ALARM and OK notifications. Created through the aws.global provider alias.
resource "aws_sns_topic" "dependency_health_checks_global" {
  provider = aws.global

  name              = "${data.aws_default_tags.current.tags.environment-name}-dependency-health-checks"
  kms_master_key_id = data.aws_kms_alias.sns_kms_key_alias_global.target_key_id

  # Delivery feedback: the same failure/success roles for every protocol,
  # with every successful delivery sampled (rate 100).

  # application
  application_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  application_success_feedback_sample_rate = 100
  application_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # firehose
  firehose_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  firehose_success_feedback_sample_rate = 100
  firehose_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # http
  http_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  http_success_feedback_sample_rate = 100
  http_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # lambda
  lambda_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  lambda_success_feedback_sample_rate = 100
  lambda_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn

  # sqs
  sqs_success_feedback_role_arn    = data.aws_iam_role.sns_success_feedback.arn
  sqs_success_feedback_sample_rate = 100
  sqs_failure_feedback_role_arn    = data.aws_iam_role.sns_failure_feedback.arn
}

# Route 53 HTTPS health check against the app's DNS record, requesting
# /health-check/dependency on port 443 from three checker regions.
resource "aws_route53_health_check" "dependency_health_check" {
  provider = aws.global

  # What to probe
  type          = "HTTPS"
  fqdn          = aws_route53_record.app.fqdn
  port          = 443
  resource_path = "/health-check/dependency"

  # How to probe: every 30 seconds, a single failure counts as unhealthy
  request_interval  = 30
  failure_threshold = 1
  measure_latency   = true
  regions           = ["us-east-1", "eu-west-1", "ap-southeast-1"]

  # Only the first 20 characters of the environment name are used here
  # (presumably to stay within a length limit on the reference - TODO confirm).
  reference_name = "${substr(data.aws_default_tags.current.tags.environment-name, 0, 20)}-dependency-hc"

  tags = {
    Name = "${data.aws_default_tags.current.tags.environment-name} dependency health check"
  }
}

# CloudWatch alarm on the dependency health check's AWS/Route53
# HealthCheckStatus metric. It goes into ALARM when the minimum value over a
# single 60 second period is below 1 (Route 53 reports 1 for healthy and 0 for
# unhealthy). ALARM and OK transitions are published to the dependency health
# checks topic, but only when var.dependency_health_check_alarm_enabled is true.
resource "aws_cloudwatch_metric_alarm" "dependency_health_check" {
  provider = aws.global

  alarm_name = "${data.aws_default_tags.current.tags.environment-name}-dependency-health-check-alarm"
  # The description previously ended in "for}" - a stray brace left over from
  # an interpolation. Name the endpoint being checked instead.
  alarm_description = "${data.aws_default_tags.current.tags.environment-name} dependency health check for ${aws_route53_record.app.fqdn}"

  # Notifications
  actions_enabled = var.dependency_health_check_alarm_enabled
  alarm_actions   = [aws_sns_topic.dependency_health_checks_global.arn]
  ok_actions      = [aws_sns_topic.dependency_health_checks_global.arn]

  # Metric
  namespace   = "AWS/Route53"
  metric_name = "HealthCheckStatus"
  dimensions = {
    HealthCheckId = aws_route53_health_check.dependency_health_check.id
  }

  # Evaluation: one breaching datapoint in one period is enough to alarm
  statistic           = "Minimum"
  period              = 60
  comparison_operator = "LessThanThreshold"
  threshold           = 1
  evaluation_periods  = 1
  datapoints_to_alarm = 1
}

# PagerDuty integration on the main service, using the CloudWatch vendor,
# dedicated to the dependency health check alarm.
resource "pagerduty_service_integration" "dependency_health_check" {
  service = data.pagerduty_service.main.id
  vendor  = data.pagerduty_vendor.cloudwatch.id
  name    = "Modernising LPA ${data.aws_default_tags.current.tags.environment-name} Dependency Health Check Alarm"
}

# Subscribes the PagerDuty events endpoint of the dependency health check
# integration to the dependency health checks topic over HTTPS.
resource "aws_sns_topic_subscription" "dependency_health_check" {
  provider = aws.global

  topic_arn = aws_sns_topic.dependency_health_checks_global.arn

  protocol = "https"
  # Use the dependency integration's key. The endpoint previously used
  # pagerduty_service_integration.service_health_check, which routed dependency
  # alerts to the service health check integration and left the dependency
  # integration unused.
  endpoint               = "https://events.pagerduty.com/integration/${pagerduty_service_integration.dependency_health_check.integration_key}/enqueue"
  endpoint_auto_confirms = true
}
34 changes: 0 additions & 34 deletions terraform/environment/region/healthcheck.tf

This file was deleted.

12 changes: 12 additions & 0 deletions terraform/environment/region/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,15 @@ variable "lpa_store_service" {
variable "mock_onelogin_enabled" {
type = bool
}

# Toggles actions_enabled on the dependency health check alarm.
# Off unless the caller sets it.
variable "dependency_health_check_alarm_enabled" {
  description = "Enable the dependency health check alert actions"
  type        = bool
  default     = false
}

# Toggles actions_enabled on the service health check alarm.
# Off unless the caller sets it.
variable "service_health_check_alarm_enabled" {
  description = "Enable the service health check alert actions"
  type        = bool
  default     = false
}
8 changes: 6 additions & 2 deletions terraform/environment/regions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ module "eu_west_1" {
base_url = local.environment.lpa_store_service.base_url
api_arns = local.environment.lpa_store_service.api_arns
}
mock_onelogin_enabled = local.environment.mock_onelogin_enabled
mock_onelogin_enabled = local.environment.mock_onelogin_enabled
dependency_health_check_alarm_enabled = local.environment.app.dependency_health_check_alarm_enabled
service_health_check_alarm_enabled = local.environment.app.service_health_check_alarm_enabled
providers = {
aws.region = aws.eu_west_1
aws.global = aws.global
Expand Down Expand Up @@ -109,7 +111,9 @@ module "eu_west_2" {
base_url = local.environment.lpa_store_service.base_url
api_arns = local.environment.lpa_store_service.api_arns
}
mock_onelogin_enabled = local.environment.mock_onelogin_enabled
mock_onelogin_enabled = local.environment.mock_onelogin_enabled
dependency_health_check_alarm_enabled = local.environment.app.dependency_health_check_alarm_enabled
service_health_check_alarm_enabled = local.environment.app.service_health_check_alarm_enabled
providers = {
aws.region = aws.eu_west_2
aws.global = aws.global
Expand Down
24 changes: 18 additions & 6 deletions terraform/environment/terraform.tfvars.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
"autoscaling": {
"minimum": 1,
"maximum": 3
}
},
"dependency_health_check_alarm_enabled": false,
"service_health_check_alarm_enabled": false
},
"mock_onelogin_enabled": false,
"uid_service": {
Expand Down Expand Up @@ -87,7 +89,9 @@
"autoscaling": {
"minimum": 1,
"maximum": 3
}
},
"dependency_health_check_alarm_enabled": true,
"service_health_check_alarm_enabled": true
},
"mock_onelogin_enabled": true,
"uid_service": {
Expand Down Expand Up @@ -157,7 +161,9 @@
"autoscaling": {
"minimum": 1,
"maximum": 3
}
},
"dependency_health_check_alarm_enabled": false,
"service_health_check_alarm_enabled": false
},
"mock_onelogin_enabled": true,
"uid_service": {
Expand Down Expand Up @@ -227,7 +233,9 @@
"autoscaling": {
"minimum": 1,
"maximum": 3
}
},
"dependency_health_check_alarm_enabled": false,
"service_health_check_alarm_enabled": false
},
"mock_onelogin_enabled": true,
"uid_service": {
Expand Down Expand Up @@ -297,7 +305,9 @@
"autoscaling": {
"minimum": 1,
"maximum": 3
}
},
"dependency_health_check_alarm_enabled": false,
"service_health_check_alarm_enabled": false
},
"mock_onelogin_enabled": false,
"uid_service": {
Expand Down Expand Up @@ -367,7 +377,9 @@
"autoscaling": {
"minimum": 1,
"maximum": 3
}
},
"dependency_health_check_alarm_enabled": true,
"service_health_check_alarm_enabled": true
},
"mock_onelogin_enabled": false,
"uid_service": {
Expand Down
2 changes: 2 additions & 0 deletions terraform/environment/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ variable "environments" {
minimum = number
maximum = number
})
dependency_health_check_alarm_enabled = bool
service_health_check_alarm_enabled = bool
})
mock_onelogin_enabled = bool
uid_service = object({
Expand Down

0 comments on commit 319c1a8

Please sign in to comment.