diff --git a/docs/src/guide/extensions_cluster_autoscaler.md b/docs/src/guide/extensions_cluster_autoscaler.md
index 2ef90cd1..21c563c5 100644
--- a/docs/src/guide/extensions_cluster_autoscaler.md
+++ b/docs/src/guide/extensions_cluster_autoscaler.md
@@ -1,4 +1,4 @@
-# Extensions: Cluster Autoscaler
+# Extensions: Standalone Cluster Autoscaler
 
 Deployed using the [cluster-autoscaler Helm chart](https://github.com/kubernetes/autoscaler/tree/master/charts/cluster-autoscaler) with configuration from the `worker_pools` variable.
 
@@ -13,6 +13,66 @@
 The following parameters may be added on each pool definition to enable management by the cluster autoscaler:
 * `min_size`: Define the minimum scale of a pool managed by the cluster autoscaler. Defaults to `size` when not provided.
 * `max_size`: Define the maximum scale of a pool managed by the cluster autoscaler. Defaults to `size` when not provided.
+
+The cluster autoscaler manages the size of the node pools that have the attribute `autoscale = true`. To avoid a conflict between the actual `size` of a node pool and the `size` defined in the Terraform configuration, you can add the `ignore_initial_pool_size = true` attribute to the pool definition in the `worker_pools` variable. This attribute allows Terraform to ignore [drift](https://developer.hashicorp.com/terraform/tutorials/state/resource-drift) of the `size` parameter for that specific node pool.
+
+This setting is strongly recommended for node pools configured with `autoscale = true`.
+
+Example:
+
+```
+worker_pools = {
+  np-autoscaled = {
+    description              = "Node pool managed by cluster autoscaler",
+    size                     = 2,
+    min_size                 = 1,
+    max_size                 = 3,
+    autoscale                = true,
+    ignore_initial_pool_size = true # allows the pool size to drift
+  },
+  np-autoscaler = {
+    description      = "Node pool with cluster autoscaler scheduling allowed",
+    size             = 1,
+    allow_autoscaler = true,
+  },
+}
+```
+
+For existing deployments, it is necessary to use the [terraform state mv](https://developer.hashicorp.com/terraform/cli/commands/state/mv) command so that existing pools are moved to the new resource addresses instead of being destroyed and recreated.
+
+Example for the `oci_containerengine_node_pool` resource:
+
+```
+$ terraform plan
+...
+Terraform will perform the following actions:
+
+  # module.oke.module.workers[0].oci_containerengine_node_pool.tfscaled_workers["np-autoscaled"] will be destroyed
+...
+  # module.oke.module.workers[0].oci_containerengine_node_pool.autoscaled_workers["np-autoscaled"] will be created
+
+$ terraform state mv module.oke.module.workers[0].oci_containerengine_node_pool.tfscaled_workers[\"np-autoscaled\"] module.oke.module.workers[0].oci_containerengine_node_pool.autoscaled_workers[\"np-autoscaled\"]
+
+Successfully moved 1 object(s).
+
+$ terraform plan
+...
+No changes. Your infrastructure matches the configuration.
+```
+
+Example for the `oci_core_instance_pool` resource:
+
+```
+$ terraform state mv module.oke.module.workers[0].oci_core_instance_pool.tfscaled_workers[\"np-autoscaled\"] module.oke.module.workers[0].oci_core_instance_pool.autoscaled_workers[\"np-autoscaled\"]
+
+Successfully moved 1 object(s).
+```
+
 ### Notes
 
 Don't set `allow_autoscaler` and `autoscale` to `true` on the same pool. This will cause the cluster autoscaler pod to be unschedulable as the `oke.oraclecloud.com/cluster_autoscaler: managed` node label will override the `oke.oraclecloud.com/cluster_autoscaler: allowed` node label specified by the cluster autoscaler `nodeSelector` pod attribute.
diff --git a/docs/src/resources.md b/docs/src/resources.md
index 37cc61ae..70f8d0f4 100644
--- a/docs/src/resources.md
+++ b/docs/src/resources.md
@@ -53,7 +53,7 @@
 
 ## Workers
 
-* [oci_containerengine_node_pool.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/containerengine_node_pool)
+* [oci_containerengine_node_pool.tfscaled_workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/containerengine_node_pool)
 * [oci_containerengine_virtual_node_pool.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/containerengine_virtual_node_pool)
 * [oci_core_cluster_network.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/core_cluster_network)
 * [oci_core_instance.workers](https://registry.terraform.io/providers/oracle/oci/latest/docs/resources/core_instance)
diff --git a/examples/workers/vars-workers-advanced.auto.tfvars b/examples/workers/vars-workers-advanced.auto.tfvars
index f2e2cd51..8ff0bebe 100644
--- a/examples/workers/vars-workers-advanced.auto.tfvars
+++ b/examples/workers/vars-workers-advanced.auto.tfvars
@@ -25,24 +25,26 @@ worker_pools = {
     create = false
   },
   wg_np-vm-ol7 = {
-    description = "OKE-managed Node Pool with OKE Oracle Linux 7 image",
-    create      = false,
-    mode        = "node-pool",
-    size        = 1,
-    size_max    = 2,
-    os          = "Oracle Linux",
-    os_version  = "7",
-    autoscale   = true,
+    description              = "OKE-managed Node Pool with OKE Oracle Linux 7 image",
+    create                   = false,
+    mode                     = "node-pool",
+    size                     = 1,
+    size_max                 = 2,
+    os                       = "Oracle Linux",
+    os_version               = "7",
+    autoscale                = true,
+    ignore_initial_pool_size = true
   },
   wg_np-vm-ol8 = {
-    description = "OKE-managed Node Pool with OKE Oracle Linux 8 image",
-    create      = false,
-    mode        = "node-pool",
-    size        = 1,
-    size_max    = 3,
-    os          = "Oracle Linux",
-    os_version  = "8",
-    autoscale   = true,
+    description              = "OKE-managed Node Pool with OKE Oracle Linux 8 image",
+    create                   = false,
+    mode                     = "node-pool",
+    size                     = 1,
+    size_max                 = 3,
+    os                       = "Oracle Linux",
+    os_version               = "8",
+    autoscale                = true,
+    ignore_initial_pool_size = true
   },
   wg_np-vm-custom = {
     description = "OKE-managed Node Pool with custom image",
diff --git a/examples/workers/vars-workers-autoscaling.auto.tfvars b/examples/workers/vars-workers-autoscaling.auto.tfvars
index 965dea1b..db1de473 100644
--- a/examples/workers/vars-workers-autoscaling.auto.tfvars
+++ b/examples/workers/vars-workers-autoscaling.auto.tfvars
@@ -5,11 +5,12 @@ worker_pools = {
   np-autoscaled = {
-    description = "Node pool managed by cluster autoscaler",
-    size        = 2,
-    min_size    = 1,
-    max_size    = 3,
-    autoscale   = true,
+    description              = "Node pool managed by cluster autoscaler",
+    size                     = 2,
+    min_size                 = 1,
+    max_size                 = 3,
+    autoscale                = true,
+    ignore_initial_pool_size = true
   },
   np-autoscaler = {
     description = "Node pool with cluster autoscaler scheduling allowed",
diff --git a/migration.tf b/migration.tf
index d61d4e95..9f0f7ca5 100644
--- a/migration.tf
+++ b/migration.tf
@@ -49,3 +49,13 @@ moved {
   from = module.oke.oci_containerengine_node_pool.nodepools
   to   = module.workers[0].oci_containerengine_node_pool.workers
 }
+
+moved {
+  from = module.workers[0].oci_containerengine_node_pool.workers
+  to   = module.workers[0].oci_containerengine_node_pool.tfscaled_workers
+}
+
+moved {
+  from = module.workers[0].oci_core_instance_pool.workers
+  to   = module.workers[0].oci_core_instance_pool.tfscaled_workers
+}
\ No newline at end of file
diff --git a/modules/workers/instancepools.tf b/modules/workers/instancepools.tf
index 36c0654c..1c7df68b 100644
--- a/modules/workers/instancepools.tf
+++ b/modules/workers/instancepools.tf
@@ -2,9 +2,9 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
 
 # Dynamic resource block for Instance Pool groups defined in worker_pools
-resource "oci_core_instance_pool" "workers" {
+resource "oci_core_instance_pool" "tfscaled_workers" {
   # Create an OCI Instance Pool resource for each enabled entry of the worker_pools map with that mode.
-  for_each                  = local.enabled_instance_pools
+  for_each                  = { for key, value in local.enabled_instance_pools: key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == false }
   compartment_id            = each.value.compartment_id
   display_name              = each.key
   size                      = each.value.size
@@ -61,3 +61,63 @@
     }
   }
 }
+
+resource "oci_core_instance_pool" "autoscaled_workers" {
+  # Create an OCI Instance Pool resource for each enabled entry of the worker_pools map with that mode.
+  for_each                  = { for key, value in local.enabled_instance_pools: key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == true }
+  compartment_id            = each.value.compartment_id
+  display_name              = each.key
+  size                      = each.value.size
+  instance_configuration_id = oci_core_instance_configuration.workers[each.key].id
+  defined_tags              = each.value.defined_tags
+  freeform_tags             = each.value.freeform_tags
+
+  dynamic "placement_configurations" {
+    for_each = each.value.availability_domains
+    iterator = ad
+
+    content {
+      availability_domain = ad.value
+      primary_subnet_id   = each.value.subnet_id
+
+      # Value(s) specified on pool, or null to select automatically
+      fault_domains = try(each.value.placement_fds, null)
+
+      dynamic "secondary_vnic_subnets" {
+        for_each = lookup(each.value, "secondary_vnics", {})
+        iterator = vnic
+        content {
+          display_name = vnic.key
+          subnet_id    = lookup(vnic.value, "subnet_id", each.value.subnet_id)
+        }
+      }
+    }
+  }
+
+  lifecycle {
+    ignore_changes = [
+      display_name, defined_tags, freeform_tags,
+      placement_configurations, size
+    ]
+
+    precondition {
+      condition     = coalesce(each.value.image_id, "none") != "none"
+      error_message = <<-EOT
+      Missing image_id; check provided value if image_type is 'custom', or image_os/image_os_version if image_type is 'oke' or 'platform'.
+        pool: ${each.key}
+        image_type: ${coalesce(each.value.image_type, "none")}
+        image_id: ${coalesce(each.value.image_id, "none")}
+      EOT
+    }
+
+    precondition {
+      condition     = var.cni_type == "flannel"
+      error_message = "Instance Pools require a cluster with `cni_type = flannel`."
+    }
+
+    precondition {
+      condition     = each.value.autoscale == false
+      error_message = "Instance Pools do not support cluster autoscaler management."
+    }
+  }
+}
diff --git a/modules/workers/locals.tf b/modules/workers/locals.tf
index f239e12f..85ddf45e 100644
--- a/modules/workers/locals.tf
+++ b/modules/workers/locals.tf
@@ -36,6 +36,7 @@ locals {
     eviction_grace_duration = 300
     force_node_delete       = true
     extended_metadata       = {} # empty pool-specific default
+    ignore_initial_pool_size = false
     image_id                = var.image_id
     image_type              = var.image_type
     kubernetes_version      = var.kubernetes_version
@@ -231,9 +232,9 @@
   }
 
   # Maps of worker pool OCI resources by pool name enriched with desired/custom parameters for various modes
-  worker_node_pools         = { for k, v in oci_containerengine_node_pool.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
+  worker_node_pools         = { for k, v in merge(oci_containerengine_node_pool.tfscaled_workers, oci_containerengine_node_pool.autoscaled_workers) : k => merge(v, lookup(local.worker_pools_final, k, {})) }
   worker_virtual_node_pools = { for k, v in oci_containerengine_virtual_node_pool.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
-  worker_instance_pools     = { for k, v in oci_core_instance_pool.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
+  worker_instance_pools     = { for k, v in merge(oci_core_instance_pool.tfscaled_workers, oci_core_instance_pool.autoscaled_workers) : k => merge(v, lookup(local.worker_pools_final, k, {})) }
   worker_cluster_networks   = { for k, v in oci_core_cluster_network.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
   worker_instances          = { for k, v in oci_core_instance.workers : k => merge(v, lookup(local.worker_pools_final, k, {})) }
 
diff --git a/modules/workers/nodepools.tf b/modules/workers/nodepools.tf
index bcc3f3df..7d01ddad 100644
--- a/modules/workers/nodepools.tf
+++ b/modules/workers/nodepools.tf
@@ -2,9 +2,9 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
 
 # Dynamic resource block for Node Pool groups defined in worker_pools
-resource "oci_containerengine_node_pool" "workers" {
+resource "oci_containerengine_node_pool" "tfscaled_workers" {
   # Create an OKE node pool resource for each enabled entry of the worker_pools map with that mode.
-  for_each           = local.enabled_node_pools
+  for_each           = { for key, value in local.enabled_node_pools: key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == false }
   cluster_id         = var.cluster_id
   compartment_id     = each.value.compartment_id
   defined_tags       = each.value.defined_tags
@@ -156,3 +156,158 @@
     }
   }
 }
+
+resource "oci_containerengine_node_pool" "autoscaled_workers" {
+  # Create an OKE node pool resource for each enabled entry of the worker_pools map with that mode.
+  for_each           = { for key, value in local.enabled_node_pools: key => value if tobool(lookup(value, "ignore_initial_pool_size", false)) == true }
+  cluster_id         = var.cluster_id
+  compartment_id     = each.value.compartment_id
+  defined_tags       = each.value.defined_tags
+  freeform_tags      = each.value.freeform_tags
+  kubernetes_version = each.value.kubernetes_version
+  name               = each.key
+  node_shape         = each.value.shape
+  ssh_public_key     = var.ssh_public_key
+
+  node_config_details {
+    size                                = each.value.size
+    is_pv_encryption_in_transit_enabled = each.value.pv_transit_encryption
+    kms_key_id                          = each.value.volume_kms_key_id
+    nsg_ids                             = each.value.nsg_ids
+    defined_tags                        = each.value.defined_tags
+    freeform_tags                       = each.value.freeform_tags
+
+    dynamic "placement_configs" {
+      for_each = each.value.availability_domains
+      iterator = ad
+
+      content {
+        availability_domain     = ad.value
+        capacity_reservation_id = each.value.capacity_reservation_id
+        subnet_id               = each.value.subnet_id
+
+        # Value(s) specified on pool, or null to select automatically
+        fault_domains = try(each.value.placement_fds, null)
+
+        dynamic "preemptible_node_config" {
+          for_each = each.value.preemptible_config.enable ? [1] : []
+          content {
+            preemption_action {
+              type                    = "TERMINATE"
+              is_preserve_boot_volume = each.value.preemptible_config.is_preserve_boot_volume
+            }
+          }
+        }
+      }
+    }
+
+    dynamic "node_pool_pod_network_option_details" {
+      for_each = var.cni_type == "flannel" ? [1] : []
+      content { # Flannel requires cni type only
+        cni_type = "FLANNEL_OVERLAY"
+      }
+    }
+
+    dynamic "node_pool_pod_network_option_details" {
+      for_each = var.cni_type == "npn" ? [1] : []
+      content { # VCN-Native requires max pods/node, nsg ids, subnet ids
+        cni_type          = "OCI_VCN_IP_NATIVE"
+        max_pods_per_node = each.value.max_pods_per_node
+        pod_nsg_ids       = compact(tolist(each.value.pod_nsg_ids))
+        pod_subnet_ids    = compact(tolist([each.value.pod_subnet_id]))
+      }
+    }
+  }
+
+  node_metadata = merge(
+    {
+      apiserver_host           = var.apiserver_private_host
+      oke-kubeproxy-proxy-mode = var.kubeproxy_mode
+      user_data                = lookup(lookup(data.cloudinit_config.workers, each.key, {}), "rendered", "")
+    },
+
+    # Only provide cluster DNS service address if set explicitly; determined automatically in practice.
+    coalesce(var.cluster_dns, "none") == "none" ? {} : { kubedns_svc_ip = var.cluster_dns },
+
+    # Extra user-defined fields merged last
+    var.node_metadata,                       # global
+    lookup(each.value, "node_metadata", {}), # pool-specific
+  )
+
+  node_eviction_node_pool_settings {
+    eviction_grace_duration = (floor(tonumber(each.value.eviction_grace_duration) / 60) > 0 ?
+      (each.value.eviction_grace_duration > 3600 ?
+        format("PT%dM", 60) :
+        (each.value.eviction_grace_duration % 60 == 0 ?
+          format("PT%dM", floor(each.value.eviction_grace_duration / 60)) :
+          format("PT%dM%dS", floor(each.value.eviction_grace_duration / 60), each.value.eviction_grace_duration % 60)
+        )
+      ) :
+      format("PT%dS", each.value.eviction_grace_duration)
+    )
+    is_force_delete_after_grace_duration = tobool(each.value.force_node_delete)
+  }
+
+  dynamic "node_shape_config" {
+    for_each = length(regexall("Flex", each.value.shape)) > 0 ? [1] : []
+    content {
+      ocpus = each.value.ocpus
+      memory_in_gbs = ( # If > 64GB memory/core, correct input to exactly 64GB memory/core
+        (each.value.memory / each.value.ocpus) > 64 ?
+        each.value.ocpus * 64 : each.value.memory
+      )
+    }
+  }
+
+  node_pool_cycling_details {
+    is_node_cycling_enabled = each.value.node_cycling_enabled
+    maximum_surge           = each.value.node_cycling_max_surge
+    maximum_unavailable     = each.value.node_cycling_max_unavailable
+  }
+
+  node_source_details {
+    boot_volume_size_in_gbs = each.value.boot_volume_size
+    image_id                = each.value.image_id
+    source_type             = "image"
+  }
+
+  lifecycle { # prevent resource changes for changed fields
+    ignore_changes = [
+      # kubernetes_version, # e.g. if changed as part of an upgrade
+      name, defined_tags, freeform_tags,
+      node_metadata["user_data"],               # templated cloud-init
+      node_config_details[0].placement_configs, # dynamic placement configs
+      node_config_details[0].size               # size
+    ]
+
+    precondition {
+      condition     = coalesce(each.value.image_id, "none") != "none"
+      error_message = <<-EOT
+      Missing image_id; check provided value if image_type is 'custom', or image_os/image_os_version if image_type is 'oke' or 'platform'.
+        pool: ${each.key}
+        image_type: ${coalesce(each.value.image_type, "none")}
+        image_id: ${coalesce(each.value.image_id, "none")}
+      EOT
+    }
+
+    precondition {
+      condition = anytrue([
+        contains(["instance-pool", "cluster-network"], each.value.mode), # supported modes
+        length(lookup(each.value, "secondary_vnics", {})) == 0,          # unrestricted when empty/unset
+      ])
+      error_message = "Unsupported option for mode=${each.value.mode}: secondary_vnics"
+    }
+
+    precondition {
+      condition     = coalesce(each.value.capacity_reservation_id, "none") == "none" || length(each.value.availability_domains) == 1
+      error_message = "A single availability domain must be specified when using a capacity reservation with mode=${each.value.mode}"
+    }
+  }
+
+  dynamic "initial_node_labels" {
+    for_each = each.value.node_labels
+    content {
+      key   = initial_node_labels.key
+      value = initial_node_labels.value
+    }
+  }
+}
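
A minimal, self-contained sketch of the partitioning pattern used by the new `for_each` filters above: pools are split on the `ignore_initial_pool_size` flag, so each pool lands in exactly one of the `tfscaled_*` or `autoscaled_*` resources. The pool names and values below are illustrative only (not part of this change), and in the module itself the flag defaults to `false` via `modules/workers/locals.tf`.

```
# Illustrative only: how the new for_each filters split worker_pools on ignore_initial_pool_size.
locals {
  worker_pools = {
    np-autoscaled = { size = 2, min_size = 1, max_size = 3, autoscale = true, ignore_initial_pool_size = true }
    np-static     = { size = 1, ignore_initial_pool_size = false }
  }

  # Same filter shape as the tfscaled_workers / autoscaled_workers resources.
  tfscaled_pools   = { for k, v in local.worker_pools : k => v if tobool(lookup(v, "ignore_initial_pool_size", false)) == false }
  autoscaled_pools = { for k, v in local.worker_pools : k => v if tobool(lookup(v, "ignore_initial_pool_size", false)) == true }
}

output "tfscaled_pool_names" {
  value = keys(local.tfscaled_pools) # => ["np-static"]
}

output "autoscaled_pool_names" {
  value = keys(local.autoscaled_pools) # => ["np-autoscaled"]
}
```

Only the pools in the autoscaled group additionally carry `size` (instance pools) or `node_config_details[0].size` (node pools) in `ignore_changes`, which is what lets the cluster autoscaler resize them without Terraform reverting the change on the next apply.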