From 39d7b0750c8f75a9c26c0b8101999ff0c0b8998b Mon Sep 17 00:00:00 2001
From: "simeon.zhekov"
Date: Fri, 6 Dec 2024 16:56:38 +0200
Subject: [PATCH] Added wait condition until the node count equals the desired capacity.

---
 modules/graphdb/main.tf | 6 ++
 modules/graphdb/templates/00_functions.sh | 74 ++++++++++++++++++-
 .../templates/01_wait_node_count.sh.tpl | 9 +++
 modules/graphdb/user_data.tf | 1 +
 modules/graphdb/variables.tf | 12 ++++
 5 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/modules/graphdb/main.tf b/modules/graphdb/main.tf
index 507c74a..4bf85f9 100644
--- a/modules/graphdb/main.tf
+++ b/modules/graphdb/main.tf
@@ -87,6 +87,11 @@ resource "aws_autoscaling_group" "graphdb_auto_scaling_group" {
 
   target_group_arns = var.graphdb_target_group_arns
 
+  instance_maintenance_policy {
+    min_healthy_percentage = var.instance_maintenance_policy_min_healthy_percentage
+    max_healthy_percentage = var.instance_maintenance_policy_max_healthy_percentage
+  }
+
   launch_template {
     id = aws_launch_template.graphdb.id
     version = aws_launch_template.graphdb.latest_version
@@ -119,3 +124,4 @@ resource "aws_autoscaling_group" "graphdb_auto_scaling_group" {
     }
   }
 }
+
diff --git a/modules/graphdb/templates/00_functions.sh b/modules/graphdb/templates/00_functions.sh
index 01d10cf..affbbb9 100644
--- a/modules/graphdb/templates/00_functions.sh
+++ b/modules/graphdb/templates/00_functions.sh
@@ -1,12 +1,80 @@
 #!/usr/bin/env bash
 
-# Generic helper functions
-
-# Function to print messages with timestamps
+# Function to log messages with a timestamp
 log_with_timestamp() {
   echo "$(date '+%Y-%m-%d %H:%M:%S'): $1"
 }
 
+# Function to check ASG node counts
+# Blocks until the ASG has no more InService instances than its desired
+# capacity and none of its instances are Terminating or shutting down.
+# Arguments: $1 - name of the Auto Scaling group
+# Exits with status 1 if the desired capacity cannot be read, or if the
+# conditions are still unmet after MAX_RETRIES retries RETRY_DELAY seconds apart.
+wait_for_asg_nodes() {
+  local ASG_NAME="$1"
+  local RETRY_DELAY=10
+  local MAX_RETRIES=30
+  local RETRY_COUNT=0
+
+  # Get the desired capacity of the ASG
+  local NODE_COUNT
+  NODE_COUNT=$(aws autoscaling describe-auto-scaling-groups \
+    --auto-scaling-group-names "$ASG_NAME" \
+    --query "AutoScalingGroups[0].DesiredCapacity" \
+    --output text)
+
+  # Check if NODE_COUNT is not an integer
+  if ! [[ "$NODE_COUNT" =~ ^[0-9]+$ ]]; then
+    log_with_timestamp "Error: Unable to retrieve valid Desired Capacity for ASG: $ASG_NAME. Received value: $NODE_COUNT."
+    exit 1
+  fi
+
+  log_with_timestamp "Checking ASG node count for $ASG_NAME with desired node count: $NODE_COUNT"
+
+  while true; do
+    # Check InService and Terminating states via ASG
+    local IN_SERVICE_NODE_COUNT
+    IN_SERVICE_NODE_COUNT=$(aws autoscaling describe-auto-scaling-groups \
+      --auto-scaling-group-names "$ASG_NAME" \
+      --query "AutoScalingGroups[0].Instances[?LifecycleState=='InService'] | length(@)" \
+      --output text)
+
+    local TERMINATING_NODE_COUNT
+    TERMINATING_NODE_COUNT=$(aws autoscaling describe-auto-scaling-groups \
+      --auto-scaling-group-names "$ASG_NAME" \
+      --query "AutoScalingGroups[0].Instances[?LifecycleState=='Terminating'] | length(@)" \
+      --output text)
+
+    # Scope the lookup to this ASG through the tag Auto Scaling sets on its
+    # instances, so unrelated instances shutting down elsewhere are ignored
+    local SHUTTING_DOWN_NODE_COUNT
+    SHUTTING_DOWN_NODE_COUNT=$(aws ec2 describe-instances \
+      --filters "Name=instance-state-name,Values=shutting-down" \
+        "Name=tag:aws:autoscaling:groupName,Values=$ASG_NAME" \
+      --query "Reservations[].Instances[].InstanceId | length(@)" \
+      --output text)
+
+    log_with_timestamp "InService: $IN_SERVICE_NODE_COUNT, Terminating: $TERMINATING_NODE_COUNT, Shutting-down: $SHUTTING_DOWN_NODE_COUNT, Desired: $NODE_COUNT"
+
+    if [[ -z "$IN_SERVICE_NODE_COUNT" || "$IN_SERVICE_NODE_COUNT" -le "$NODE_COUNT" ]] \
+      && [[ "$TERMINATING_NODE_COUNT" -eq 0 ]] \
+      && [[ "$SHUTTING_DOWN_NODE_COUNT" -eq 0 ]]; then
+      log_with_timestamp "Conditions met: InService <= $NODE_COUNT, no Terminating, no Shutting-down. Proceeding..."
+      break
+    else
+      if [ "$RETRY_COUNT" -ge "$MAX_RETRIES" ]; then
+        log_with_timestamp "Error: Maximum retry attempts reached. Exiting..."
+        exit 1
+      fi
+
+      log_with_timestamp "Conditions not met. Waiting... (InService: $IN_SERVICE_NODE_COUNT, Terminating: $TERMINATING_NODE_COUNT, Shutting-down: $SHUTTING_DOWN_NODE_COUNT)"
+      sleep "$RETRY_DELAY"
+      RETRY_COUNT=$((RETRY_COUNT + 1))
+    fi
+  done
+}
+
 # Function which waits for all DNS records to be created
 wait_dns_records() {
   local ZONE_ID="$1"
diff --git a/modules/graphdb/templates/01_wait_node_count.sh.tpl b/modules/graphdb/templates/01_wait_node_count.sh.tpl
index 1ba088d..e0f1f8d 100644
--- a/modules/graphdb/templates/01_wait_node_count.sh.tpl
+++ b/modules/graphdb/templates/01_wait_node_count.sh.tpl
@@ -23,6 +23,15 @@ echo "#####################################################"
 IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
 AZ=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/placement/availability-zone)
 ASG_NAME=${name}
+GRAPHDB_NODE_COUNT=${node_count}
+
+# Only run the wait_for_asg_nodes function if graphdb_node_count is more than 1
+if [ "$GRAPHDB_NODE_COUNT" -gt 1 ]; then
+  echo "GraphDB node count is greater than 1. Running wait_asg_nodes..."
+  wait_for_asg_nodes "$ASG_NAME"
+else
+  echo "GraphDB node count is 1 or less. Skipping wait_asg_nodes."
+fi
 
 instance_refresh_status=$(aws autoscaling describe-instance-refreshes --auto-scaling-group-name "$ASG_NAME" --query 'InstanceRefreshes[?Status==`InProgress`]' --output json)
 
diff --git a/modules/graphdb/user_data.tf b/modules/graphdb/user_data.tf
index 9346e01..abc6dc2 100644
--- a/modules/graphdb/user_data.tf
+++ b/modules/graphdb/user_data.tf
@@ -30,6 +30,7 @@ data "cloudinit_config" "graphdb_user_data" {
     content_type = "text/x-shellscript"
     content = templatefile("${path.module}/templates/01_wait_node_count.sh.tpl", {
       name : var.resource_name_prefix
+      node_count : var.graphdb_node_count
     })
   }
 
diff --git a/modules/graphdb/variables.tf b/modules/graphdb/variables.tf
index c6f894c..8c90835 100644
--- a/modules/graphdb/variables.tf
+++ b/modules/graphdb/variables.tf
@@ -394,3 +394,15 @@ variable "ebs_default_kms_key" {
   description = "Define default KMS key"
   type = string
 }
+
+variable "instance_maintenance_policy_min_healthy_percentage" {
+  description = "Define minimum healthy percentage for the Instance Maintenance Policy"
+  type = number
+  default = 66
+}
+
+variable "instance_maintenance_policy_max_healthy_percentage" {
+  description = "Define maximum healthy percentage for the Instance Maintenance Policy"
+  type = number
+  default = 100
+}