Skip to content

Commit

Permalink
Updated the Backup script to work with single node
Browse files Browse the repository at this point in the history
Removed hardcoded values for DNS zone in userdata scripts
Updated the NSGs ports based on graphdb_node_count
Updated user_data.tf
Changed the name of the LB target group to avoid conflicts when scaling from 1 to 3 AZs
Updated the monitoring to not deploy cluster alarms when a single node is deployed
Updated the availability_tests
Updated how the VPC azs are calculated based on the graphdb_node_count
Added calculations for the subnets based on the graphdb_node_count
Removed route53_availability_content_match from modules/monitoring
Moved route53_availability_http_string_type to root level.
Changed the availability test to support single node deployment.
Added dynamic change of the availability tests http string type based on tls_enabled
Updated the README.md
Updated CHANGELOG.md
  • Loading branch information
viktor-ribchev committed Jul 2, 2024
1 parent c04e033 commit be3b0f1
Show file tree
Hide file tree
Showing 17 changed files with 400 additions and 68 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# GraphDB AWS Terraform Module Changelog

## 1.2.0
* Added support for single node deployment.
* Added new userdata script `10_start_graphdb_services.sh.tpl` for single node setup.
* Made cluster-related userdata scripts executable only when `graphdb_node_count` is greater than 1.
* Removed hardcoded values from the userdata scripts.
* Changed the availability tests' `http_string_type` to be calculated based on whether TLS is enabled.


## 1.0.1

* Updated GraphDB version to [10.6.4](https://graphdb.ontotext.com/documentation/10.6/release-notes.html#graphdb-10-6-4)
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,15 @@ vpc_public_subnet_ids = ["public-subnet-1","public-subnet-2","public-subnet-3"]
vpc_private_subnet_ids = ["private-subnet-1","private-subnet-2","private-subnet-3"]
```

## Single Node Deployment

This Terraform module can also deploy a single instance of GraphDB.
To deploy a single instance, set `graphdb_node_count` to 1; everything else is configured automatically.

**Important:** While scaling from a single node deployment to a cluster (e.g., from 1 node to 3 nodes) is possible,
it is not recommended. Synchronizing the repository across all nodes can be time-consuming,
potentially causing scripts to time out.

## Updating configurations on an active deployment

### Updating Configurations
Expand Down
30 changes: 25 additions & 5 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@ data "aws_region" "current" {}

data "aws_caller_identity" "current" {}


locals {
# Reduce to one subnet if node_count is 1
effective_private_subnet_cidrs = var.graphdb_node_count == 1 ? [var.vpc_private_subnet_cidrs[0]] : var.vpc_private_subnet_cidrs
effective_public_subnet_cidrs = var.graphdb_node_count == 1 ? [var.vpc_public_subnet_cidrs[0]] : var.vpc_public_subnet_cidrs
# Determine the appropriate subnets based on node_count
lb_subnets = var.graphdb_node_count == 1 ? (var.vpc_id == "" ? (var.lb_internal ? [module.vpc[0].private_subnet_ids[0]] : [module.vpc[0].public_subnet_ids[0]]) : (var.lb_internal ? [var.vpc_private_subnet_ids[0]] : [var.vpc_public_subnet_ids[0]])) : (var.vpc_id == "" ? (var.lb_internal ? module.vpc[0].private_subnet_ids : module.vpc[0].public_subnet_ids) : (var.lb_internal ? var.vpc_private_subnet_ids : var.vpc_public_subnet_ids))
# Check if node_count is 1 and select only one subnet if true
graphdb_subnets = var.graphdb_node_count == 1 ? [(var.vpc_id != "" ? var.vpc_private_subnet_ids : module.vpc[0].private_subnet_ids)[0]] : (var.vpc_id != "" ? var.vpc_private_subnet_ids : module.vpc[0].private_subnet_ids)
}

module "vpc" {
source = "./modules/vpc"

Expand All @@ -10,8 +21,8 @@ module "vpc" {
resource_name_prefix = var.resource_name_prefix
vpc_dns_hostnames = var.vpc_dns_hostnames
vpc_dns_support = var.vpc_dns_support
vpc_private_subnet_cidrs = var.vpc_private_subnet_cidrs
vpc_public_subnet_cidrs = var.vpc_public_subnet_cidrs
vpc_private_subnet_cidrs = local.effective_private_subnet_cidrs
vpc_public_subnet_cidrs = local.effective_public_subnet_cidrs
vpc_cidr_block = var.vpc_cidr_block
single_nat_gateway = var.single_nat_gateway
enable_nat_gateway = var.enable_nat_gateway
Expand All @@ -21,6 +32,7 @@ module "vpc" {
vpc_endpoint_service_accept_connection_requests = var.vpc_endpoint_service_accept_connection_requests
vpc_enable_flow_logs = var.vpc_enable_flow_logs
vpc_flow_log_bucket_arn = var.vpc_enable_flow_logs && var.deploy_logging_module ? module.logging[0].graphdb_logging_bucket_arn : null
graphdb_node_count = var.graphdb_node_count
}

module "backup" {
Expand Down Expand Up @@ -93,7 +105,7 @@ module "load_balancer" {

resource_name_prefix = var.resource_name_prefix
vpc_id = var.vpc_id != "" ? var.vpc_id : module.vpc[0].vpc_id
lb_subnets = var.vpc_id == "" ? (var.lb_internal ? module.vpc[0].private_subnet_ids : module.vpc[0].public_subnet_ids) : (var.lb_internal ? var.vpc_private_subnet_ids : var.vpc_public_subnet_ids)
lb_subnets = local.lb_subnets
lb_internal = var.lb_internal
lb_deregistration_delay = var.lb_deregistration_delay
lb_health_check_path = var.lb_health_check_path
Expand All @@ -103,6 +115,7 @@ module "load_balancer" {
lb_tls_policy = var.lb_tls_policy
lb_access_logs_bucket_name = var.lb_enable_access_logs && var.deploy_logging_module ? module.logging[0].graphdb_logging_bucket_name : null
lb_enable_access_logs = var.lb_enable_access_logs
graphdb_node_count = var.graphdb_node_count
}

locals {
Expand All @@ -111,6 +124,11 @@ locals {
)
}

locals {
lb_tls_enabled = var.lb_tls_certificate_arn != null ? true : false
calculated_http_string_type = local.lb_tls_enabled == true ? "HTTPS" : "HTTP"
}

module "monitoring" {
source = "./modules/monitoring"
providers = {
Expand All @@ -129,6 +147,8 @@ module "monitoring" {
cloudwatch_log_group_retention_in_days = var.monitoring_log_group_retention_in_days
route53_availability_request_url = module.load_balancer.lb_dns_name
route53_availability_measure_latency = var.monitoring_route53_measure_latency
graphdb_node_count = var.graphdb_node_count
route53_availability_http_string_type = local.calculated_http_string_type
}

module "graphdb" {
Expand All @@ -142,13 +162,13 @@ module "graphdb" {

allowed_inbound_cidrs = var.allowed_inbound_cidrs_lb
allowed_inbound_cidrs_ssh = var.allowed_inbound_cidrs_ssh
graphdb_subnets = var.vpc_id != "" ? var.vpc_private_subnet_ids : module.vpc[0].private_subnet_ids
graphdb_subnets = local.graphdb_subnets
graphdb_target_group_arns = local.graphdb_target_group_arns
vpc_id = var.vpc_id != "" ? var.vpc_id : module.vpc[0].vpc_id

# Network Load Balancer
lb_enable_private_access = var.lb_internal ? var.lb_enable_private_access : false
lb_subnets = var.vpc_id == "" ? (var.lb_internal ? module.vpc[0].private_subnet_ids : module.vpc[0].public_subnet_ids) : (var.lb_internal ? var.vpc_private_subnet_ids : var.vpc_public_subnet_ids)
lb_subnets = local.lb_subnets
graphdb_lb_dns_name = module.load_balancer.lb_dns_name

# GraphDB Configurations
Expand Down
13 changes: 8 additions & 5 deletions modules/graphdb/nsg.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ resource "aws_security_group_rule" "graphdb_internal_http" {
description = "Allow GraphDB proxies and nodes to communicate (HTTP)."
security_group_id = aws_security_group.graphdb_security_group.id
type = "ingress"
from_port = 7200
from_port = var.graphdb_node_count == 1 ? 7201 : 7200
to_port = 7201
protocol = "tcp"
cidr_blocks = local.subnet_cidr_blocks
}

resource "aws_security_group_rule" "graphdb_internal_raft" {
count = var.graphdb_node_count != 1 ? 1 : 0

description = "Allow GraphDB proxies and nodes to communicate (Raft)."
security_group_id = aws_security_group.graphdb_security_group.id
type = "ingress"
Expand All @@ -25,7 +27,8 @@ resource "aws_security_group_rule" "graphdb_internal_raft" {
}

resource "aws_security_group_rule" "graphdb_ssh_inbound" {
count = var.allowed_inbound_cidrs_ssh != null ? 1 : 0
count = var.allowed_inbound_cidrs_ssh != null ? 1 : 0

description = "Allow specified CIDRs SSH access to the GraphDB instances."
security_group_id = aws_security_group.graphdb_security_group.id
type = "ingress"
Expand All @@ -51,8 +54,8 @@ resource "aws_security_group_rule" "graphdb_network_lb_ingress" {
description       = "CIDRs allowed to access GraphDB."
security_group_id = aws_security_group.graphdb_security_group.id
type = "ingress"
from_port = 7200
to_port = 7200
from_port = var.graphdb_node_count == 1 ? 7201 : 7200
to_port = var.graphdb_node_count == 1 ? 7201 : 7200
protocol = "tcp"
cidr_blocks = var.allowed_inbound_cidrs
}
Expand All @@ -64,7 +67,7 @@ resource "aws_security_group_rule" "graphdb_lb_healthchecks" {
description = "Allow the load balancer to healthcheck the GraphDB nodes and access the proxies."
security_group_id = aws_security_group.graphdb_security_group.id
type = "ingress"
from_port = 7200
from_port = var.graphdb_node_count == 1 ? 7201 : 7200
to_port = 7201
protocol = "tcp"
cidr_blocks = local.lb_subnet_cidr_blocks
Expand Down
32 changes: 22 additions & 10 deletions modules/graphdb/templates/05_gdb_backup_conf.sh.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,16 @@ if [ ${deploy_backup} == "true" ]; then
cat <<-EOF >/usr/bin/graphdb_backup
#!/bin/bash
set -euxo pipefail
set -euo pipefail
GRAPHDB_ADMIN_PASSWORD="\$(aws --cli-connect-timeout 300 ssm get-parameter --region ${region} --name "/${name}/graphdb/admin_password" --with-decryption | jq -r .Parameter.Value | base64 -d)"
NODE_STATE="\$(curl --silent --fail --user "admin:\$GRAPHDB_ADMIN_PASSWORD" localhost:7201/rest/cluster/node/status | jq -r .nodeState)"
if [ "\$NODE_STATE" != "LEADER" ]; then
echo "current node is not a leader, but \$NODE_STATE"
exit 0
fi
NODE_STATE="\$(curl --silent -u "admin:\$GRAPHDB_ADMIN_PASSWORD" http://localhost:7201/rest/cluster/node/status | jq -r .nodeState)"
function trigger_backup {
local backup_name="\$(date +'%Y-%m-%d_%H-%M-%S').tar"
current_time=$(date +"%T %Y-%m-%d")
start_time=$(date +%s)
echo "Creating backup $backup_name at $start_time"
curl \
-vvv --fail \
Expand Down Expand Up @@ -59,9 +57,23 @@ function rotate_backups {
done
}
if ! trigger_backup; then
echo "failed to create backup"
exit 1
# Checks if GraphDB is running in cluster
IS_CLUSTER=\$(
curl -s -o /dev/null \
-u "admin:\$GRAPHDB_ADMIN_PASSWORD" \
-w "%%{http_code}" \
http://localhost:7200/rest/monitor/cluster
)
if [ "\$IS_CLUSTER" == 200 ]; then
# Checks if the current GraphDB instance is Leader, otherwise exits.
if [ "\$NODE_STATE" != "LEADER" ]; then
echo "current node is not a leader, but \$NODE_STATE"
exit 0
fi
(trigger_backup && echo "") | tee -a /var/opt/graphdb/node/graphdb_backup.log
elif [ "\$IS_CLUSTER" == 503 ]; then
(trigger_backup && echo "") | tee -a /var/opt/graphdb/node/graphdb_backup.log
fi
rotate_backups
Expand Down
13 changes: 7 additions & 6 deletions modules/graphdb/templates/08_cluster_setup.sh.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ fi

# Function which waits for all DNS records to be created
wait_dns_records() {
local all_dns_records=($(aws route53 list-resource-record-sets --hosted-zone-id "${zone_id}" --query "ResourceRecordSets[?contains(Name, '.graphdb.cluster') == \`true\`].Name" --output text))
local all_dns_records=($(aws route53 list-resource-record-sets --hosted-zone-id "${zone_id}" --query "ResourceRecordSets[?contains(Name, '.${route53_zone_dns_name}') == \`true\`].Name" --output text))
local all_dns_records_count="$${#all_dns_records[@]}"

if [ "$${all_dns_records_count}" -ne $${NODE_COUNT} ]; then
Expand Down Expand Up @@ -77,7 +77,7 @@ check_gdb() {
wait_dns_records

# Existing records are returned with . at the end
EXISTING_DNS_RECORDS=$(aws route53 list-resource-record-sets --hosted-zone-id "${zone_id}" --query "ResourceRecordSets[?contains(Name, '.graphdb.cluster') == \`true\`].Name")
EXISTING_DNS_RECORDS=$(aws route53 list-resource-record-sets --hosted-zone-id "${zone_id}" --query "ResourceRecordSets[?contains(Name, '.${route53_zone_dns_name}') == \`true\`].Name")
# Convert the output into an array
readarray -t EXISTING_DNS_RECORDS_ARRAY <<<$(echo "$EXISTING_DNS_RECORDS" | jq -r '.[] | rtrimstr(".")')
# Builds grpc addresses for all nodes registered in Route53
Expand Down Expand Up @@ -156,7 +156,8 @@ create_cluster() {
elif [ "$is_cluster" == 503 ]; then
# Create the GraphDB cluster configuration if it does not exist.
local cluster_create=$(
curl -X POST -s http://localhost:7201/rest/cluster/config \
# TODO update to use node-1
curl -X POST -s "http://node-1.${route53_zone_dns_name}:7201/rest/cluster/config" \
-o "/dev/null" \
-w "%%{http_code}" \
-H 'Content-type: application/json' \
Expand Down Expand Up @@ -188,7 +189,7 @@ enable_security() {
# Set the admin password
local set_password=$(
curl --location -s -w "%%{http_code}" \
--request PATCH 'http://localhost:7200/rest/security/users/admin' \
--request PATCH 'http://localhost:7201/rest/security/users/admin' \
--header 'Content-Type: application/json' \
--data "{ \"password\": \"$${GRAPHDB_ADMIN_PASSWORD}\" }"
)
Expand All @@ -204,7 +205,7 @@ enable_security() {
curl -X POST -s -w "%%{http_code}" \
--header 'Content-Type: application/json' \
--header 'Accept: */*' \
-d 'true' 'http://localhost:7200/rest/security'
-d 'true' 'http://localhost:7201/rest/security'
)

if [[ "$enable_security" == 200 ]]; then
Expand All @@ -221,7 +222,7 @@ check_security_status() {
curl -s -X GET \
--header 'Accept: application/json' \
-u "admin:$${GRAPHDB_ADMIN_PASSWORD}" \
'http://localhost:7200/rest/security'
'http://localhost:7201/rest/security'
)

# Check if GDB security is enabled
Expand Down
4 changes: 2 additions & 2 deletions modules/graphdb/templates/09_node_join.sh.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ CURRENT_NODE_NAME=$(hostname)
LEADER_NODE=""
RAFT_DIR="/var/opt/graphdb/node/data/raft"

# Get existing DNS records from Route53 which contain .graphdb.cluster in their name
EXISTING_RECORDS=$(aws route53 list-resource-record-sets --hosted-zone-id "${zone_id}" --query "ResourceRecordSets[?contains(Name, '.graphdb.cluster') == \`true\`].Name")
# Get existing DNS records from Route53 which contain .${route53_zone_dns_name} in their name
EXISTING_RECORDS=$(aws route53 list-resource-record-sets --hosted-zone-id "${zone_id}" --query "ResourceRecordSets[?contains(Name, '.${route53_zone_dns_name}') == \`true\`].Name")
# Use jq to process the JSON output, remove the last dot from each element, and convert it to an array
EXISTING_RECORDS=$(echo "$EXISTING_RECORDS" | jq -r '.[] | rtrimstr(".")')
# Convert the output into an array
Expand Down
Loading

0 comments on commit be3b0f1

Please sign in to comment.