From 39d7b0750c8f75a9c26c0b8101999ff0c0b8998b Mon Sep 17 00:00:00 2001
From: "simeon.zhekov" <simeon.zhekov@ontotext.com>
Date: Fri, 6 Dec 2024 16:56:38 +0200
Subject: [PATCH] Added wait condition until the node count equals the desired
 capacity.

---
 modules/graphdb/main.tf                       |  6 ++
 modules/graphdb/templates/00_functions.sh     | 66 ++++++++++++++++++-
 .../templates/01_wait_node_count.sh.tpl       |  9 +++
 modules/graphdb/user_data.tf                  |  1 +
 modules/graphdb/variables.tf                  | 12 ++++
 5 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/modules/graphdb/main.tf b/modules/graphdb/main.tf
index 507c74a..4bf85f9 100644
--- a/modules/graphdb/main.tf
+++ b/modules/graphdb/main.tf
@@ -87,6 +87,11 @@ resource "aws_autoscaling_group" "graphdb_auto_scaling_group" {
 
   target_group_arns = var.graphdb_target_group_arns
 
+  instance_maintenance_policy {
+    min_healthy_percentage = var.instance_maintenance_policy_min_healthy_percentage
+    max_healthy_percentage = var.instance_maintenance_policy_max_healthy_percentage
+  }
+
   launch_template {
     id      = aws_launch_template.graphdb.id
     version = aws_launch_template.graphdb.latest_version
@@ -119,3 +124,4 @@ resource "aws_autoscaling_group" "graphdb_auto_scaling_group" {
     }
   }
 }
+
diff --git a/modules/graphdb/templates/00_functions.sh b/modules/graphdb/templates/00_functions.sh
index 01d10cf..affbbb9 100644
--- a/modules/graphdb/templates/00_functions.sh
+++ b/modules/graphdb/templates/00_functions.sh
@@ -1,12 +1,72 @@
 #!/usr/bin/env bash
 
-# Generic helper functions
-
-# Function to print messages with timestamps
+# Function to log messages with a timestamp
 log_with_timestamp() {
   echo "$(date '+%Y-%m-%d %H:%M:%S'): $1"
 }
 
+# Function to check ASG node counts
+wait_for_asg_nodes() {
+  local ASG_NAME="$1"
+  local RETRY_DELAY=10
+  local MAX_RETRIES=30
+  local RETRY_COUNT=0
+
+  # Get the desired capacity of the ASG
+  local NODE_COUNT
+  NODE_COUNT=$(aws autoscaling describe-auto-scaling-groups \
+    --auto-scaling-group-names "$ASG_NAME" \
+    --query "AutoScalingGroups[0].DesiredCapacity" \
+    --output text)
+
+  # Check if NODE_COUNT is not an integer
+  if ! [[ "$NODE_COUNT" =~ ^[0-9]+$ ]]; then
+  log_with_timestamp "Error: Unable to retrieve valid Desired Capacity for ASG: $ASG_NAME. Received value: $NODE_COUNT."
+  exit 1
+  fi
+
+  log_with_timestamp "Checking ASG node count for $ASG_NAME with desired node count: $NODE_COUNT"
+
+  while true; do
+    # Check InService and Terminating states via ASG
+    local IN_SERVICE_NODE_COUNT
+    IN_SERVICE_NODE_COUNT=$(aws autoscaling describe-auto-scaling-groups \
+      --auto-scaling-group-names "$ASG_NAME" \
+      --query "AutoScalingGroups[0].Instances[?LifecycleState=='InService'] | length(@)" \
+      --output text)
+
+    local TERMINATING_NODE_COUNT
+    TERMINATING_NODE_COUNT=$(aws autoscaling describe-auto-scaling-groups \
+      --auto-scaling-group-names "$ASG_NAME" \
+      --query "AutoScalingGroups[0].Instances[?LifecycleState=='Terminating'] | length(@)" \
+      --output text)
+
+    local SHUTTING_DOWN_NODE_COUNT
+    SHUTTING_DOWN_NODE_COUNT=$(aws ec2 describe-instances \
+      --filters "Name=instance-state-name,Values=shutting-down" \
+      --query "Reservations[].Instances[].InstanceId | length(@)" \
+      --output text)
+
+    log_with_timestamp "InService: $IN_SERVICE_NODE_COUNT, Terminating: $TERMINATING_NODE_COUNT, Shutting-down: $SHUTTING_DOWN_NODE_COUNT, Desired: $NODE_COUNT"
+
+    if [[ -z "$IN_SERVICE_NODE_COUNT" || "$IN_SERVICE_NODE_COUNT" -le "$NODE_COUNT" ]] \
+      && [[ "$TERMINATING_NODE_COUNT" -eq 0 ]] \
+      && [[ "$SHUTTING_DOWN_NODE_COUNT" -eq 0 ]]; then
+      log_with_timestamp "Conditions met: InService <= $NODE_COUNT, no Terminating, no Shutting-down. Proceeding..."
+      break
+    else
+      if [ "$RETRY_COUNT" -ge "$MAX_RETRIES" ]; then
+        log_with_timestamp "Error: Maximum retry attempts reached. Exiting..."
+        exit 1
+      fi
+
+      log_with_timestamp "Conditions not met. Waiting... (InService: $IN_SERVICE_NODE_COUNT, Terminating: $TERMINATING_NODE_COUNT, Shutting-down: $SHUTTING_DOWN_NODE_COUNT)"
+      sleep "$RETRY_DELAY"
+      RETRY_COUNT=$((RETRY_COUNT + 1))
+    fi
+  done
+}
+
 # Function which waits for all DNS records to be created
 wait_dns_records() {
   local ZONE_ID="$1"
diff --git a/modules/graphdb/templates/01_wait_node_count.sh.tpl b/modules/graphdb/templates/01_wait_node_count.sh.tpl
index 1ba088d..e0f1f8d 100644
--- a/modules/graphdb/templates/01_wait_node_count.sh.tpl
+++ b/modules/graphdb/templates/01_wait_node_count.sh.tpl
@@ -23,6 +23,15 @@ echo "#####################################################"
 IMDS_TOKEN=$(curl -Ss -H "X-aws-ec2-metadata-token-ttl-seconds: 6000" -XPUT 169.254.169.254/latest/api/token)
 AZ=$(curl -Ss -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" 169.254.169.254/latest/meta-data/placement/availability-zone)
 ASG_NAME=${name}
+GRAPHDB_NODE_COUNT=${node_count}
+
+# Only run the wait_for_asg_nodes function if the configured GraphDB node count is more than 1
+if [ "$GRAPHDB_NODE_COUNT" -gt 1 ]; then
+  echo "GraphDB node count is greater than 1. Running wait_for_asg_nodes..."
+  wait_for_asg_nodes "$ASG_NAME"
+else
+  echo "GraphDB node count is 1 or less. Skipping wait_for_asg_nodes."
+fi
 
 instance_refresh_status=$(aws autoscaling describe-instance-refreshes --auto-scaling-group-name "$ASG_NAME" --query 'InstanceRefreshes[?Status==`InProgress`]' --output json)
 
diff --git a/modules/graphdb/user_data.tf b/modules/graphdb/user_data.tf
index 9346e01..abc6dc2 100644
--- a/modules/graphdb/user_data.tf
+++ b/modules/graphdb/user_data.tf
@@ -30,6 +30,7 @@ data "cloudinit_config" "graphdb_user_data" {
     content_type = "text/x-shellscript"
     content = templatefile("${path.module}/templates/01_wait_node_count.sh.tpl", {
       name : var.resource_name_prefix
+      node_count : var.graphdb_node_count
     })
   }
 
diff --git a/modules/graphdb/variables.tf b/modules/graphdb/variables.tf
index c6f894c..8c90835 100644
--- a/modules/graphdb/variables.tf
+++ b/modules/graphdb/variables.tf
@@ -394,3 +394,15 @@ variable "ebs_default_kms_key" {
   description = "Define default KMS key"
   type        = string
 }
+
+variable "instance_maintenance_policy_min_healthy_percentage" {
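+  description = "Minimum percentage of the ASG desired capacity that must stay healthy and in service while instances are being replaced (Instance Maintenance Policy). Valid values: 0-100"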
+  type        = number
+  default     = 66
+}
+
+variable "instance_maintenance_policy_max_healthy_percentage" {
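+  description = "Maximum percentage of the ASG desired capacity that may be healthy and in service or pending while instances are being replaced (Instance Maintenance Policy). Valid values: 100-200"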
+  type        = number
+  default     = 100
+}