From 46ec5f4f8464b209f1248bcbacf89f342c269d8f Mon Sep 17 00:00:00 2001
From: Andrei Ilas
Date: Wed, 3 Jul 2024 08:22:04 +0300
Subject: [PATCH] Add option to ignore size for nodepools and instance pools

---
 .../guide/extensions_cluster_autoscaler.md    |  62 ++++++-
 docs/src/resources.md                         |   2 +-
 .../workers/vars-workers-advanced.auto.tfvars |  34 ++--
 .../vars-workers-autoscaling.auto.tfvars      |  11 +-
 migration.tf                                  |  10 ++
 modules/workers/instancepools.tf              |  64 ++++++-
 modules/workers/locals.tf                     |   5 +-
 modules/workers/nodepools.tf                  | 159 +++++++++++++++++-
 8 files changed, 318 insertions(+), 29 deletions(-)

diff --git a/docs/src/guide/extensions_cluster_autoscaler.md b/docs/src/guide/extensions_cluster_autoscaler.md
index 2ef90cd1..21c563c5 100644
--- a/docs/src/guide/extensions_cluster_autoscaler.md
+++ b/docs/src/guide/extensions_cluster_autoscaler.md
@@ -1,4 +1,4 @@
-# Extensions: Cluster Autoscaler
+# Extensions: Standalone Cluster Autoscaler
 
 Deployed using the [cluster-autoscaler Helm chart](https://github.com/kubernetes/autoscaler/tree/master/charts/cluster-autoscaler) with configuration from the `worker_pools` variable.
 
@@ -13,6 +13,66 @@ The following parameters may be added on each pool definition to enable manageme
 * `min_size`: Define the minimum scale of a pool managed by the cluster autoscaler. Defaults to `size` when not provided.
 * `max_size`: Define the maximum scale of a pool managed by the cluster autoscaler. Defaults to `size` when not provided.
 
+The cluster autoscaler manages the size of node pools defined with `autoscale = true`. To avoid conflicts between the actual `size` of an autoscaled node pool and the `size` defined in the Terraform configuration, add the `ignore_initial_pool_size = true` attribute to the pool definition in the `worker_pools` variable. This attribute allows Terraform to ignore [drift](https://developer.hashicorp.com/terraform/tutorials/state/resource-drift) of the `size` attribute for that specific pool.
+
+This setting is strongly recommended for node pools configured with `autoscale = true`.
+
+Example:
+
+```
+worker_pools = {
+  np-autoscaled = {
+    description = "Node pool managed by cluster autoscaler",
+    size        = 2,
+    min_size    = 1,
+    max_size    = 3,
+    autoscale   = true,
+    ignore_initial_pool_size = true # allows node pool size drift
+  },
+  np-autoscaler = {
+    description = "Node pool with cluster autoscaler scheduling allowed",
+    size        = 1,
+    allow_autoscaler = true,
+  },
+}
+```
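+
+Internally, pools with `ignore_initial_pool_size = true` are managed by a separate Terraform resource whose `lifecycle` block lists the pool size under `ignore_changes`, since `ignore_changes` cannot be toggled per instance of a single resource. A simplified sketch of the pattern used in `modules/workers/nodepools.tf` (details elided):
+
+```
+# Pools whose size stays under Terraform control
+resource "oci_containerengine_node_pool" "tfscaled_workers" {
+  for_each = { for k, v in local.enabled_node_pools :
+    k => v if !tobool(lookup(v, "ignore_initial_pool_size", false)) }
+  # ...
+}
+
+# Pools whose size is left to the cluster autoscaler
+resource "oci_containerengine_node_pool" "autoscaled_workers" {
+  for_each = { for k, v in local.enabled_node_pools :
+    k => v if tobool(lookup(v, "ignore_initial_pool_size", false)) }
+  # ...
+  lifecycle {
+    ignore_changes = [node_config_details[0].size]
+  }
+}
+```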
+
+For existing deployments, it is necessary to move the affected resources to their new addresses with the [terraform state mv](https://developer.hashicorp.com/terraform/cli/commands/state/mv) command; otherwise, Terraform plans to destroy and recreate the pools.
+
+Example for the `nodepool` resource:
+
+```
+$ terraform plan
+...
+Terraform will perform the following actions:
+
+  # module.oke.module.workers[0].oci_containerengine_node_pool.tfscaled_workers["np-autoscaled"] will be destroyed
+...
+
+  # module.oke.module.workers[0].oci_containerengine_node_pool.autoscaled_workers["np-autoscaled"] will be created
+
+$ terraform state mv module.oke.module.workers[0].oci_containerengine_node_pool.tfscaled_workers[\"np-autoscaled\"] module.oke.module.workers[0].oci_containerengine_node_pool.autoscaled_workers[\"np-autoscaled\"]
+
+Successfully moved 1 object(s).
+
+$ terraform plan
+...
+No changes. Your infrastructure matches the configuration.
+```
+
+Example for the `instance_pool` resource:
+
+```
+$ terraform state mv module.oke.module.workers[0].oci_core_instance_pool.tfscaled_workers[\"np-autoscaled\"] module.oke.module.workers[0].oci_core_instance_pool.autoscaled_workers[\"np-autoscaled\"]
+
+Successfully moved 1 object(s).
+```
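+
+If in doubt about the exact resource addresses to move, `terraform state list` shows every address currently tracked in state; for example (assuming a Unix shell for the `grep` filter):
+
+```
+$ terraform state list | grep np-autoscaled
+module.oke.module.workers[0].oci_containerengine_node_pool.tfscaled_workers["np-autoscaled"]
+```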
+
 ### Notes
 
 Don't set `allow_autoscaler` and `autoscale` to `true` on the same pool. This will cause the cluster autoscaler pod to be unschedulable as the `oke.oraclecloud.com/cluster_autoscaler: managed` node label will override the `oke.oraclecloud.com/cluster_autoscaler: allowed` node label specified by the cluster autoscaler `nodeSelector` pod attribute.
diff --git a/docs/src/resources.md b/docs/src/resources.md
index 37cc61ae..70f8d0f4 100644
--- a/docs/src/resources.md
+++ b/docs/src/resources.md
@@ -53,7 +53,7 @@
 
 ## Workers
 
-* [oci_containerengine_node_pool.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/containerengine_node_pool)
+* [oci_containerengine_node_pool.tfscaled_workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/containerengine_node_pool)
 * [oci_containerengine_virtual_node_pool.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/containerengine_virtual_node_pool)
 * [oci_core_cluster_network.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/core_cluster_network)
 * [oci_core_instance.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/core_instance)
diff --git a/examples/workers/vars-workers-advanced.auto.tfvars b/examples/workers/vars-workers-advanced.auto.tfvars
index f2e2cd51..8ff0bebe 100644
--- a/examples/workers/vars-workers-advanced.auto.tfvars
+++ b/examples/workers/vars-workers-advanced.auto.tfvars
@@ -25,24 +25,26 @@ worker_pools = {
     create = false
   },
   wg_np-vm-ol7 = {
-    description = "OKE-managed Node Pool with OKE Oracle Linux 7 image",
-    create      = false,
-    mode        = "node-pool",
-    size        = 1,
-    size_max    = 2,
-    os          = "Oracle Linux",
-    os_version  = "7",
-    autoscale   = true,
+    description              = "OKE-managed Node Pool with OKE Oracle Linux 7 image",
+    create                   = false,
+    mode                     = "node-pool",
+    size                     = 1,
+    size_max                 = 2,
+    os                       = "Oracle Linux",
+    os_version               = "7",
+    autoscale                = true,
+    ignore_initial_pool_size = true
   },
   wg_np-vm-ol8 = {
-    description = "OKE-managed Node Pool with OKE Oracle Linux 8 image",
-    create      = false,
-    mode        = "node-pool",
-    size        = 1,
-    size_max    = 3,
-    os          = "Oracle Linux",
-    os_version  = "8",
-    autoscale   = true,
+    description              = "OKE-managed Node Pool with OKE Oracle Linux 8 image",
+    create                   = false,
+    mode                     = "node-pool",
+    size                     = 1,
+    size_max                 = 3,
+    os                       = "Oracle Linux",
+    os_version               = "8",
+    autoscale                = true,
+    ignore_initial_pool_size = true
   },
   wg_np-vm-custom = {
     description = "OKE-managed Node Pool with custom image",
diff --git a/examples/workers/vars-workers-autoscaling.auto.tfvars b/examples/workers/vars-workers-autoscaling.auto.tfvars
index 965dea1b..db1de473 100644
--- a/examples/workers/vars-workers-autoscaling.auto.tfvars
+++ b/examples/workers/vars-workers-autoscaling.auto.tfvars
@@ -5,11 +5,12 @@
 worker_pools = {
   np-autoscaled = {
-    description = "Node pool managed by cluster autoscaler",
-    size        = 2,
-    min_size    = 1,
-    max_size    = 3,
-    autoscale   = true,
+    description              = "Node pool managed by cluster autoscaler",
+    size                     = 2,
+    min_size                 = 1,
+    max_size                 = 3,
+    autoscale                = true,
+    ignore_initial_pool_size = true
   },
   np-autoscaler = {
     description = "Node pool with cluster autoscaler scheduling allowed",
diff --git a/migration.tf b/migration.tf
index d61d4e95..9f0f7ca5 100644
--- a/migration.tf
+++ b/migration.tf
@@ -49,3 +49,13 @@ moved {
   from = module.oke.oci_containerengine_node_pool.nodepools
   to   = module.workers[0].oci_containerengine_node_pool.workers
 }
+
+moved {
+  from = module.workers[0].oci_containerengine_node_pool.workers
+  to   = module.workers[0].oci_containerengine_node_pool.tfscaled_workers
+}
+
+moved {
+  from = module.workers[0].oci_core_instance_pool.workers
+  to   = module.workers[0].oci_core_instance_pool.tfscaled_workers
+}
\ No newline at end of file
diff --git a/modules/workers/instancepools.tf b/modules/workers/instancepools.tf
index 36c0654c..1c7df68b 100644
--- a/modules/workers/instancepools.tf
+++ b/modules/workers/instancepools.tf
@@ -2,9 +2,9 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
 
 # Dynamic resource block for Instance Pool groups defined in worker_pools
-resource "oci_core_instance_pool" "workers" {
+resource "oci_core_instance_pool" "tfscaled_workers" {
   # Create an OCI Instance Pool resource for each enabled entry of the worker_pools map with that mode.
-  for_each                  = local.enabled_instance_pools
+  for_each                  = { for key, value in local.enabled_instance_pools : key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == false }
   compartment_id            = each.value.compartment_id
   display_name              = each.key
   size                      = each.value.size
@@ -61,3 +61,63 @@ resource "oci_core_instance_pool" "workers" {
     }
   }
 }
+
+resource "oci_core_instance_pool" "autoscaled_workers" {
+  # Create an OCI Instance Pool resource for each enabled entry of the worker_pools map with that
+  # mode and with ignore_initial_pool_size = true, leaving the pool size to external management.
+  for_each                  = { for key, value in local.enabled_instance_pools : key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == true }
+  compartment_id            = each.value.compartment_id
+  display_name              = each.key
+  size                      = each.value.size
+  instance_configuration_id = oci_core_instance_configuration.workers[each.key].id
+  defined_tags              = each.value.defined_tags
+  freeform_tags             = each.value.freeform_tags
+
+  dynamic "placement_configurations" {
+    for_each = each.value.availability_domains
+    iterator = ad
+
+    content {
+      availability_domain = ad.value
+      primary_subnet_id   = each.value.subnet_id
+
+      # Value(s) specified on pool, or null to select automatically
+      fault_domains = try(each.value.placement_fds, null)
+
+      dynamic "secondary_vnic_subnets" {
+        for_each = lookup(each.value, "secondary_vnics", {})
+        iterator = vnic
+        content {
+          display_name = vnic.key
+          subnet_id    = lookup(vnic.value, "subnet_id", each.value.subnet_id)
+        }
+      }
+    }
+  }
+
+  lifecycle {
+    ignore_changes = [
+      display_name, defined_tags, freeform_tags,
+      placement_configurations, size
+    ]
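+
+    # `size` is ignored so that resizing performed outside of Terraform does not
+    # register as drift. Note the precondition below: instance pools are not
+    # managed by the cluster autoscaler, so this covers manual/external scaling.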
+
+    precondition {
+      condition     = coalesce(each.value.image_id, "none") != "none"
+      error_message = <<-EOT
+      Missing image_id; check provided value if image_type is 'custom', or image_os/image_os_version if image_type is 'oke' or 'platform'.
+      pool: ${each.key}
+      image_type: ${coalesce(each.value.image_type, "none")}
+      image_id: ${coalesce(each.value.image_id, "none")}
+      EOT
+    }
+
+    precondition {
+      condition     = var.cni_type == "flannel"
+      error_message = "Instance Pools require a cluster with `cni_type = flannel`."
+    }
+
+    precondition {
+      condition     = each.value.autoscale == false
+      error_message = "Instance Pools do not support cluster autoscaler management."
+    }
+  }
+}
diff --git a/modules/workers/locals.tf b/modules/workers/locals.tf
index f239e12f..85ddf45e 100644
--- a/modules/workers/locals.tf
+++ b/modules/workers/locals.tf
@@ -36,6 +36,7 @@ locals {
     eviction_grace_duration = 300
     force_node_delete       = true
     extended_metadata       = {} # empty pool-specific default
+    ignore_initial_pool_size = false
     image_id                = var.image_id
     image_type              = var.image_type
     kubernetes_version      = var.kubernetes_version
@@ -231,9 +232,9 @@
 }
 
   # Maps of worker pool OCI resources by pool name enriched with desired/custom parameters for various modes
-  worker_node_pools = { for k, v in oci_containerengine_node_pool.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
+  worker_node_pools = { for k, v in merge(oci_containerengine_node_pool.tfscaled_workers, oci_containerengine_node_pool.autoscaled_workers) : k => merge(v, lookup(local.worker_pools_final, k, {})) }
   worker_virtual_node_pools = { for k, v in oci_containerengine_virtual_node_pool.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
-  worker_instance_pools = { for k, v in oci_core_instance_pool.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
+  worker_instance_pools = { for k, v in merge(oci_core_instance_pool.tfscaled_workers, oci_core_instance_pool.autoscaled_workers) : k => merge(v, lookup(local.worker_pools_final, k, {})) }
   worker_cluster_networks = { for k, v in oci_core_cluster_network.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
   worker_instances = { for k, v in oci_core_instance.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
diff --git a/modules/workers/nodepools.tf b/modules/workers/nodepools.tf
index bcc3f3df..7d01ddad 100644
--- a/modules/workers/nodepools.tf
+++ b/modules/workers/nodepools.tf
@@ -2,9 +2,9 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
 
 # Dynamic resource block for Node Pool groups defined in worker_pools
-resource "oci_containerengine_node_pool" "workers" {
+resource "oci_containerengine_node_pool" "tfscaled_workers" {
   # Create an OKE node pool resource for each enabled entry of the worker_pools map with that mode.
-  for_each       = local.enabled_node_pools
+  for_each       = { for key, value in local.enabled_node_pools : key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == false }
   cluster_id     = var.cluster_id
   compartment_id = each.value.compartment_id
   defined_tags   = each.value.defined_tags
@@ -156,3 +156,158 @@ resource "oci_containerengine_node_pool" "workers" {
     }
   }
 }
+
+resource "oci_containerengine_node_pool" "autoscaled_workers" {
+  # Create an OKE node pool resource for each enabled entry of the worker_pools map with that mode.
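+  # Only pools with ignore_initial_pool_size = true are selected here; their size
+  # is excluded from Terraform's diff via ignore_changes below, so the cluster
+  # autoscaler can scale the pool without Terraform reverting it on the next apply.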
+  for_each           = { for key, value in local.enabled_node_pools : key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == true }
+  cluster_id         = var.cluster_id
+  compartment_id     = each.value.compartment_id
+  defined_tags       = each.value.defined_tags
+  freeform_tags      = each.value.freeform_tags
+  kubernetes_version = each.value.kubernetes_version
+  name               = each.key
+  node_shape         = each.value.shape
+  ssh_public_key     = var.ssh_public_key
+
+  node_config_details {
+    size                                = each.value.size
+    is_pv_encryption_in_transit_enabled = each.value.pv_transit_encryption
+    kms_key_id                          = each.value.volume_kms_key_id
+    nsg_ids                             = each.value.nsg_ids
+    defined_tags                        = each.value.defined_tags
+    freeform_tags                       = each.value.freeform_tags
+
+    dynamic "placement_configs" {
+      for_each = each.value.availability_domains
+      iterator = ad
+
+      content {
+        availability_domain     = ad.value
+        capacity_reservation_id = each.value.capacity_reservation_id
+        subnet_id               = each.value.subnet_id
+
+        # Value(s) specified on pool, or null to select automatically
+        fault_domains = try(each.value.placement_fds, null)
+
+        dynamic "preemptible_node_config" {
+          for_each = each.value.preemptible_config.enable ? [1] : []
+          content {
+            preemption_action {
+              type                    = "TERMINATE"
+              is_preserve_boot_volume = each.value.preemptible_config.is_preserve_boot_volume
+            }
+          }
+        }
+      }
+    }
+
+    dynamic "node_pool_pod_network_option_details" {
+      for_each = var.cni_type == "flannel" ? [1] : []
+      content { # Flannel requires cni type only
+        cni_type = "FLANNEL_OVERLAY"
+      }
+    }
+
+    dynamic "node_pool_pod_network_option_details" {
+      for_each = var.cni_type == "npn" ? [1] : []
+      content { # VCN-Native requires max pods/node, nsg ids, subnet ids
+        cni_type          = "OCI_VCN_IP_NATIVE"
+        max_pods_per_node = each.value.max_pods_per_node
+        pod_nsg_ids       = compact(tolist(each.value.pod_nsg_ids))
+        pod_subnet_ids    = compact(tolist([each.value.pod_subnet_id]))
+      }
+    }
+  }
+
+  node_metadata = merge(
+    {
+      apiserver_host           = var.apiserver_private_host
+      oke-kubeproxy-proxy-mode = var.kubeproxy_mode
+      user_data                = lookup(lookup(data.cloudinit_config.workers, each.key, {}), "rendered", "")
+    },
+
+    # Only provide cluster DNS service address if set explicitly; determined automatically in practice.
+    coalesce(var.cluster_dns, "none") == "none" ? {} : { kubedns_svc_ip = var.cluster_dns },
+
+    # Extra user-defined fields merged last
+    var.node_metadata,                       # global
+    lookup(each.value, "node_metadata", {}), # pool-specific
+  )
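+
+  # Convert eviction_grace_duration (seconds) into the ISO 8601 duration expected by
+  # the API, capped at 60 minutes: e.g. 300 -> "PT5M", 90 -> "PT1M30S", 45 -> "PT45S".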
+  node_eviction_node_pool_settings {
+    eviction_grace_duration = (floor(tonumber(each.value.eviction_grace_duration) / 60) > 0 ?
+      (each.value.eviction_grace_duration > 3600 ?
+        format("PT%dM", 60) :
+        (each.value.eviction_grace_duration % 60 == 0 ?
+          format("PT%dM", floor(each.value.eviction_grace_duration / 60)) :
+          format("PT%dM%dS", floor(each.value.eviction_grace_duration / 60), each.value.eviction_grace_duration % 60)
+        )
+      ) :
+      format("PT%dS", each.value.eviction_grace_duration)
+    )
+    is_force_delete_after_grace_duration = tobool(each.value.force_node_delete)
+  }
+
+  dynamic "node_shape_config" {
+    for_each = length(regexall("Flex", each.value.shape)) > 0 ? [1] : []
+    content {
+      ocpus = each.value.ocpus
+      memory_in_gbs = ( # If > 64GB memory/core, correct input to exactly 64GB memory/core
+        (each.value.memory / each.value.ocpus) > 64 ?
+        each.value.ocpus * 64 : each.value.memory
+      )
+    }
+  }
+
+  node_pool_cycling_details {
+    is_node_cycling_enabled = each.value.node_cycling_enabled
+    maximum_surge           = each.value.node_cycling_max_surge
+    maximum_unavailable     = each.value.node_cycling_max_unavailable
+  }
+
+  node_source_details {
+    boot_volume_size_in_gbs = each.value.boot_volume_size
+    image_id                = each.value.image_id
+    source_type             = "image"
+  }
+
+  lifecycle { # prevent resource changes for fields managed outside of Terraform
+    ignore_changes = [
+      # kubernetes_version, # e.g. if changed as part of an upgrade
+      name, defined_tags, freeform_tags,
+      node_metadata["user_data"],                # templated cloud-init
+      node_config_details[0].placement_configs, # dynamic placement configs
+      node_config_details[0].size               # pool size managed by the cluster autoscaler
+    ]
+
+    precondition {
+      condition     = coalesce(each.value.image_id, "none") != "none"
+      error_message = <<-EOT
+      Missing image_id; check provided value if image_type is 'custom', or image_os/image_os_version if image_type is 'oke' or 'platform'.
+      pool: ${each.key}
+      image_type: ${coalesce(each.value.image_type, "none")}
+      image_id: ${coalesce(each.value.image_id, "none")}
+      EOT
+    }
+
+    precondition {
+      condition = anytrue([
+        contains(["instance-pool", "cluster-network"], each.value.mode), # supported modes
+        length(lookup(each.value, "secondary_vnics", {})) == 0,          # unrestricted when empty/unset
+      ])
+      error_message = "Unsupported option for mode=${each.value.mode}: secondary_vnics"
+    }
+
+    precondition {
+      condition     = coalesce(each.value.capacity_reservation_id, "none") == "none" || length(each.value.availability_domains) == 1
+      error_message = "A single availability domain must be specified when using a capacity reservation with mode=${each.value.mode}"
+    }
+  }
+
+  dynamic "initial_node_labels" {
+    for_each = each.value.node_labels
+    content {
+      key   = initial_node_labels.key
+      value = initial_node_labels.value
+    }
+  }
+}