diff --git a/README.md b/README.md
index 3dfc92f..94657b6 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,22 @@ preparation, however it does have the following pre-requisites:
## How to Use This Module
+## Cluster Sizing
+
+By default, the Kubernetes node instance type, the number of nodes, the Redis cache size, and the database instance size
+are standardized by the presets in [deployment-size.tf](deployment-size.tf), and a preset is selected via the `size` input
+variable.
+
+Available sizes are `small`, `medium`, `large`, `xlarge`, and `xxlarge`. The default is `small`.
+
+All the values set via `deployment-size.tf` can be overridden by setting the appropriate input variables.
+
+- `kubernetes_instance_type` - The instance type for the AKS nodes
+- `kubernetes_min_node_per_az` - The minimum number of nodes per availability zone in the AKS cluster
+- `kubernetes_max_node_per_az` - The maximum number of nodes per availability zone in the AKS cluster
+- `redis_capacity` - The capacity (size) of the Redis instance
+- `database_sku_name` - The SKU name for the MySQL database server
+
## Examples
We have included documentation and reference examples for additional common
@@ -87,7 +103,7 @@ resources that lack official modules.
| [create\_private\_link](#input\_create\_private\_link) | Use for the azure private link. | `bool` | `false` | no |
| [create\_redis](#input\_create\_redis) | Boolean indicating whether to provision an redis instance (true) or not (false). | `bool` | `false` | no |
| [database\_availability\_mode](#input\_database\_availability\_mode) | n/a | `string` | `"SameZone"` | no |
-| [database\_sku\_name](#input\_database\_sku\_name) | Specifies the SKU Name for this MySQL Server | `string` | `"GP_Standard_D4ds_v4"` | no |
+| [database\_sku\_name](#input\_database\_sku\_name) | Specifies the SKU Name for this MySQL Server. Defaults to null and value from deployment-size.tf is used | `string` | `null` | no |
| [database\_version](#input\_database\_version) | Version for MySQL | `string` | `"5.7"` | no |
| [deletion\_protection](#input\_deletion\_protection) | If the instance should have deletion protection enabled. The database / Bucket can't be deleted when this value is set to `true`. | `bool` | `true` | no |
| [disable\_storage\_vault\_key\_id](#input\_disable\_storage\_vault\_key\_id) | Flag to disable the `customer_managed_key` block, the properties 'encryption.identity, encryption.keyvaultproperties' cannot be updated in a single operation. | `bool` | `false` | no |
@@ -95,13 +111,14 @@ resources that lack official modules.
| [enable\_database\_vault\_key](#input\_enable\_database\_vault\_key) | Flag to enable managed key encryption for the database. Once enabled, cannot be disabled. | `bool` | `false` | no |
| [enable\_storage\_vault\_key](#input\_enable\_storage\_vault\_key) | Flag to enable managed key encryption for the storage account. | `bool` | `false` | no |
| [external\_bucket](#input\_external\_bucket) | config an external bucket | `any` | `null` | no |
-| [kubernetes\_instance\_type](#input\_kubernetes\_instance\_type) | Use for the Kubernetes cluster. | `string` | `"Standard_D4a_v4"` | no |
-| [kubernetes\_node\_count](#input\_kubernetes\_node\_count) | n/a | `number` | `2` | no |
+| [kubernetes\_instance\_type](#input\_kubernetes\_instance\_type) | Instance type for primary node group. Defaults to null and value from deployment-size.tf is used | `string` | `null` | no |
+| [kubernetes\_max\_node\_per\_az](#input\_kubernetes\_max\_node\_per\_az) | Maximum number of nodes for the AKS cluster. Defaults to null and value from deployment-size.tf is used | `number` | `null` | no |
+| [kubernetes\_min\_node\_per\_az](#input\_kubernetes\_min\_node\_per\_az) | Minimum number of nodes for the AKS cluster. Defaults to null and value from deployment-size.tf is used | `number` | `null` | no |
| [license](#input\_license) | Your wandb/local license | `string` | n/a | yes |
| [location](#input\_location) | n/a | `string` | n/a | yes |
| [namespace](#input\_namespace) | String used for prefix resources. | `string` | n/a | yes |
| [node\_max\_pods](#input\_node\_max\_pods) | Maximum number of pods per node | `number` | `30` | no |
-| [node\_pool\_num\_zones](#input\_node\_pool\_num\_zones) | Number of availability zones to use for the node pool when node\_pool\_zones is not set. | `number` | `2` | no |
+| [node\_pool\_num\_zones](#input\_node\_pool\_num\_zones) | Number of availability zones to use for the node pool when node\_pool\_zones is not set. If neither are set, 3 zones will be used | `number` | `2` | no |
| [node\_pool\_zones](#input\_node\_pool\_zones) | Availability zones for the node pool | `list(string)` | `null` | no |
| [oidc\_auth\_method](#input\_oidc\_auth\_method) | OIDC auth method | `string` | `"implicit"` | no |
| [oidc\_client\_id](#input\_oidc\_client\_id) | The Client ID of application in your identity provider | `string` | `""` | no |
@@ -110,8 +127,8 @@ resources that lack official modules.
| [operator\_chart\_version](#input\_operator\_chart\_version) | Version of the operator chart to deploy | `string` | `"1.3.4"` | no |
| [other\_wandb\_env](#input\_other\_wandb\_env) | Extra environment variables for W&B | `map(any)` | `{}` | no |
| [parquet\_wandb\_env](#input\_parquet\_wandb\_env) | Extra environment variables for W&B | `map(string)` | `{}` | no |
-| [redis\_capacity](#input\_redis\_capacity) | Number indicating size of an redis instance | `number` | `2` | no |
-| [size](#input\_size) | Deployment size | `string` | `null` | no |
+| [redis\_capacity](#input\_redis\_capacity) | Number indicating size of an redis instance. Defaults to null and value from deployment-size.tf is used | `number` | `null` | no |
+| [size](#input\_size) | Deployment size | `string` | `"small"` | no |
| [ssl](#input\_ssl) | Enable SSL certificate | `bool` | `true` | no |
| [storage\_account](#input\_storage\_account) | Azure storage account name | `string` | `""` | no |
| [storage\_key](#input\_storage\_key) | Azure primary storage access key | `string` | `""` | no |
@@ -127,7 +144,8 @@ resources that lack official modules.
| Name | Description |
|------|-------------|
| [address](#output\_address) | n/a |
-| [aks\_node\_count](#output\_aks\_node\_count) | n/a |
+| [aks\_max\_node\_count](#output\_aks\_max\_node\_count) | n/a |
+| [aks\_min\_node\_count](#output\_aks\_min\_node\_count) | n/a |
| [aks\_node\_instance\_type](#output\_aks\_node\_instance\_type) | n/a |
| [client\_id](#output\_client\_id) | n/a |
| [cluster\_ca\_certificate](#output\_cluster\_ca\_certificate) | n/a |
@@ -144,7 +162,27 @@ resources that lack official modules.
| [url](#output\_url) | The URL to the W&B application |
-## Migrations
+## Upgrading from 3.x to 4.x
+
+4.0.0 introduced autoscaling to the AKS cluster and made the `size` variable the preferred way to set the cluster size.
+Previously, unless the `size` variable was set explicitly, there were default values for the following variables:
+- `kubernetes_instance_type`
+- `kubernetes_node_count`
+- `redis_capacity`
+- `database_sku_name`
+
+The `size` variable now defaults to `small`, and the following variables can be used to partially override the values
+set by the `size` variable:
+- `kubernetes_instance_type`
+- `kubernetes_min_node_per_az`
+- `kubernetes_max_node_per_az`
+- `redis_capacity`
+- `database_sku_name`
+
+For more information on the available sizes, see the [Cluster Sizing](#cluster-sizing) section.
+
+If having the cluster scale nodes in and out is not desired, the `kubernetes_min_node_per_az` and
+`kubernetes_max_node_per_az` can be set to the same value to prevent the cluster from scaling.
### Upgrading from 2.x to 3.x
diff --git a/deployment-size.tf b/deployment-size.tf
index 9d9698e..1fd4b39 100644
--- a/deployment-size.tf
+++ b/deployment-size.tf
@@ -2,34 +2,39 @@ locals {
# Specifications for t-shirt sized deployments
deployment_size = {
small = {
- db = "MO_Standard_E2ds_v4",
- node_count = 2,
- node_instance = "Standard_E4s_v5"
- cache = "3"
+ db = "MO_Standard_E2ds_v4",
+ min_node_count = 1,
+ max_node_count = 2,
+ node_instance = "Standard_E4s_v5"
+ cache = "3"
},
medium = {
- db = "MO_Standard_E4ds_v4",
- node_count = 2,
- node_instance = "Standard_E4s_v5"
- cache = "3"
+ db = "MO_Standard_E4ds_v4",
+ min_node_count = 1,
+ max_node_count = 2,
+ node_instance = "Standard_E4s_v5"
+ cache = "3"
},
large = {
- db = "MO_Standard_E8ds_v4",
- node_count = 3,
- node_instance = "Standard_E8s_v5"
- cache = "4"
+ db = "MO_Standard_E8ds_v4",
+ min_node_count = 1,
+ max_node_count = 2,
+ node_instance = "Standard_E8s_v5"
+ cache = "4"
},
xlarge = {
- db = "MO_Standard_E16ds_v4",
- node_count = 3,
- node_instance = "Standard_E8s_v5"
- cache = "4"
+ db = "MO_Standard_E16ds_v4",
+ min_node_count = 1,
+ max_node_count = 2,
+ node_instance = "Standard_E8s_v5"
+ cache = "4"
},
xxlarge = {
- db = "MO_Standard_E32ds_v4",
- node_count = 3,
- node_instance = "Standard_E16s_v5"
- cache = "5"
+ db = "MO_Standard_E32ds_v4",
+ min_node_count = 1,
+ max_node_count = 3,
+ node_instance = "Standard_E16s_v5"
+ cache = "5"
}
}
}
\ No newline at end of file
diff --git a/main.tf b/main.tf
index 943aebd..6088786 100644
--- a/main.tf
+++ b/main.tf
@@ -2,6 +2,12 @@ locals {
fqdn = var.subdomain == null ? var.domain_name : "${var.subdomain}.${var.domain_name}"
url_prefix = var.ssl ? "https" : "http"
url = "${local.url_prefix}://${local.fqdn}"
+
+ redis_capacity = coalesce(var.redis_capacity, local.deployment_size[var.size].cache)
+ database_sku_name = coalesce(var.database_sku_name, local.deployment_size[var.size].db)
+ kubernetes_instance_type = coalesce(var.kubernetes_instance_type, local.deployment_size[var.size].node_instance)
+ kubernetes_min_node_per_az = coalesce(var.kubernetes_min_node_per_az, local.deployment_size[var.size].min_node_count)
+ kubernetes_max_node_per_az = coalesce(var.kubernetes_max_node_per_az, local.deployment_size[var.size].max_node_count)
}
resource "azurerm_resource_group" "default" {
@@ -40,7 +46,7 @@ module "database" {
database_version = var.database_version
database_private_dns_zone_id = module.networking.database_private_dns_zone.id
database_subnet_id = module.networking.database_subnet.id
- sku_name = try(local.deployment_size[var.size].db, var.database_sku_name)
+ sku_name = local.database_sku_name
deletion_protection = var.deletion_protection
database_key_id = try(module.vault.vault_internal_keys[module.vault.vault_key_map.database].id, null)
@@ -58,7 +64,7 @@ module "redis" {
namespace = var.namespace
resource_group_name = azurerm_resource_group.default.name
location = azurerm_resource_group.default.location
- capacity = try(local.deployment_size[var.size].cache, var.redis_capacity)
+ capacity = local.redis_capacity
depends_on = [module.networking]
}
@@ -107,10 +113,6 @@ module "app_lb" {
tags = var.tags
}
-locals {
- kubernetes_instance_type = try(local.deployment_size[var.size].node_instance, var.kubernetes_instance_type)
-}
-
data "azapi_resource_list" "az_zones" {
parent_id = "/subscriptions/${data.azurerm_subscription.current.subscription_id}"
type = "Microsoft.Compute/skus@2021-07-01"
@@ -139,20 +141,21 @@ module "app_aks" {
source = "./modules/app_aks"
depends_on = [module.app_lb]
- cluster_subnet_id = module.networking.private_subnet.id
- etcd_key_vault_key_id = module.vault.etcd_key_id
- gateway = module.app_lb.gateway
- identity = module.identity.identity
- location = azurerm_resource_group.default.location
- namespace = var.namespace
- node_pool_vm_count = try(local.deployment_size[var.size].node_count, var.kubernetes_node_count)
- node_pool_vm_size = local.kubernetes_instance_type
- node_pool_zones = local.node_pool_zones
- public_subnet = module.networking.public_subnet
- resource_group = azurerm_resource_group.default
- sku_tier = var.cluster_sku_tier
- max_pods = var.node_max_pods
- tags = var.tags
+  cluster_subnet_id       = module.networking.private_subnet.id
+  etcd_key_vault_key_id   = module.vault.etcd_key_id
+  gateway                 = module.app_lb.gateway
+  identity                = module.identity.identity
+  location                = azurerm_resource_group.default.location
+  namespace               = var.namespace
+  node_pool_min_vm_per_az = local.kubernetes_min_node_per_az
+  node_pool_max_vm_per_az = local.kubernetes_max_node_per_az
+  node_pool_vm_size       = local.kubernetes_instance_type
+  node_pool_zones         = local.node_pool_zones
+  public_subnet           = module.networking.public_subnet
+  resource_group          = azurerm_resource_group.default
+  sku_tier                = var.cluster_sku_tier
+  max_pods                = var.node_max_pods # keep forwarding the root node_max_pods input; the module's node pools read var.max_pods
+  tags                    = var.tags
}
locals {
service_account_name = "wandb-app"
diff --git a/modules/app_aks/main.tf b/modules/app_aks/main.tf
index 0364275..956adc8 100644
--- a/modules/app_aks/main.tf
+++ b/modules/app_aks/main.tf
@@ -18,15 +18,17 @@ resource "azurerm_kubernetes_cluster" "default" {
}
default_node_pool {
- enable_auto_scaling = false
+ enable_auto_scaling = true
max_pods = var.max_pods
name = "default"
- node_count = var.node_pool_vm_count
+ node_count = var.node_pool_min_vm_per_az
+ max_count = var.node_pool_max_vm_per_az
+ min_count = var.node_pool_min_vm_per_az
temporary_name_for_rotation = "rotating"
type = "VirtualMachineScaleSets"
vm_size = var.node_pool_vm_size
vnet_subnet_id = var.cluster_subnet_id
- zones = var.node_pool_zones
+ zones = [ var.node_pool_zones[0] ]
}
identity {
@@ -43,7 +45,7 @@ resource "azurerm_kubernetes_cluster" "default" {
tags = var.tags
lifecycle {
- ignore_changes = [microsoft_defender]
+ ignore_changes = [microsoft_defender, default_node_pool.0.node_count]
}
key_management_service {
@@ -51,27 +53,49 @@ resource "azurerm_kubernetes_cluster" "default" {
}
}
+locals {
+  additional_zones = slice(var.node_pool_zones, 1, length(var.node_pool_zones)) # every zone after the first; the default node pool is pinned to node_pool_zones[0]
+}
+
+resource "azurerm_kubernetes_cluster_node_pool" "additional" {
+  for_each              = toset(local.additional_zones) # one pool per extra zone, keyed by zone so dropping a zone does not re-address the remaining pools
+  kubernetes_cluster_id = azurerm_kubernetes_cluster.default.id
+  enable_auto_scaling   = true
+  max_pods              = var.max_pods
+  name                  = "zone${each.value}"
+  node_count            = var.node_pool_min_vm_per_az
+  max_count             = var.node_pool_max_vm_per_az
+  min_count             = var.node_pool_min_vm_per_az
+  vm_size               = var.node_pool_vm_size
+  vnet_subnet_id        = var.cluster_subnet_id
+  zones                 = [each.value]
+
+  lifecycle {
+    ignore_changes = [node_count] # autoscaling is enabled, so node_count only seeds the initial size and later drift is ignored
+  }
+}
+
locals {
ingress_gateway_principal_id = azurerm_kubernetes_cluster.default.ingress_application_gateway.0.ingress_application_gateway_identity.0.object_id
}
resource "azurerm_role_assignment" "gateway" {
- depends_on = [ local.ingress_gateway_principal_id ]
+ depends_on = [local.ingress_gateway_principal_id]
scope = var.gateway.id
role_definition_name = "Contributor"
principal_id = local.ingress_gateway_principal_id
}
resource "azurerm_role_assignment" "resource_group" {
- depends_on = [ local.ingress_gateway_principal_id ]
+ depends_on = [local.ingress_gateway_principal_id]
scope = var.resource_group.id
role_definition_name = "Reader"
principal_id = local.ingress_gateway_principal_id
}
resource "azurerm_role_assignment" "public_subnet" {
- depends_on = [ local.ingress_gateway_principal_id ]
+ depends_on = [local.ingress_gateway_principal_id]
scope = var.public_subnet.id
role_definition_name = "Contributor"
principal_id = local.ingress_gateway_principal_id
diff --git a/modules/app_aks/variables.tf b/modules/app_aks/variables.tf
index 772fc35..77709b5 100644
--- a/modules/app_aks/variables.tf
+++ b/modules/app_aks/variables.tf
@@ -46,7 +46,11 @@ variable "node_pool_vm_size" {
type = string
}
-variable "node_pool_vm_count" {
+variable "node_pool_min_vm_per_az" {
+ type = number
+}
+
+variable "node_pool_max_vm_per_az" {
type = number
}
diff --git a/modules/app_lb/main.tf b/modules/app_lb/main.tf
index b046b30..518f200 100644
--- a/modules/app_lb/main.tf
+++ b/modules/app_lb/main.tf
@@ -17,7 +17,7 @@ locals {
listener_name = "${var.network.name}-httplstn"
request_routing_rule_name = "${var.network.name}-rqrt"
redirect_configuration_name = "${var.network.name}-rdrcfg"
- app_gateway_name = var.private_link ? "${var.namespace}-ag-private-link" : "${var.namespace}-ag"
+ app_gateway_name = var.private_link ? "${var.namespace}-ag-private-link" : "${var.namespace}-ag"
}
diff --git a/modules/app_lb/variables.tf b/modules/app_lb/variables.tf
index 01c19f6..4c25a71 100644
--- a/modules/app_lb/variables.tf
+++ b/modules/app_lb/variables.tf
@@ -39,6 +39,6 @@ variable "private_subnet" {
}
variable "private_link" {
- type = bool
+ type = bool
description = "Specifies the Azure private link creation"
}
\ No newline at end of file
diff --git a/modules/networking/main.tf b/modules/networking/main.tf
index a5f2bc5..1528407 100644
--- a/modules/networking/main.tf
+++ b/modules/networking/main.tf
@@ -9,10 +9,10 @@ resource "azurerm_virtual_network" "default" {
}
resource "azurerm_subnet" "private" {
- name = "${var.namespace}-private"
- resource_group_name = var.resource_group_name
- address_prefixes = [var.network_private_subnet_cidr]
- virtual_network_name = azurerm_virtual_network.default.name
+ name = "${var.namespace}-private"
+ resource_group_name = var.resource_group_name
+ address_prefixes = [var.network_private_subnet_cidr]
+ virtual_network_name = azurerm_virtual_network.default.name
private_link_service_network_policies_enabled = var.private_link ? false : true
service_endpoints = concat(
diff --git a/modules/networking/variables.tf b/modules/networking/variables.tf
index af10679..81735e7 100644
--- a/modules/networking/variables.tf
+++ b/modules/networking/variables.tf
@@ -56,7 +56,7 @@ variable "tags" {
}
variable "private_link" {
- type = bool
+ type = bool
description = "Private link flag for multi region storage endpoint access"
}
diff --git a/outputs.tf b/outputs.tf
index f1a9ab7..33c443d 100644
--- a/outputs.tf
+++ b/outputs.tf
@@ -45,16 +45,20 @@ output "standardized_size" {
value = var.size
}
-output "aks_node_count" {
- value = try(local.deployment_size[var.size].node_count, var.kubernetes_node_count)
+output "aks_min_node_count" {
+ value = local.kubernetes_min_node_per_az
+}
+
+output "aks_max_node_count" {
+ value = local.kubernetes_max_node_per_az
}
output "aks_node_instance_type" {
- value = try(local.deployment_size[var.size].node_instance, var.kubernetes_instance_type)
+ value = local.kubernetes_instance_type
}
output "database_instance_type" {
- value = try(local.deployment_size[var.size].db, var.database_sku_name)
+ value = local.database_sku_name
}
output "client_id" {
diff --git a/variables.tf b/variables.tf
index 5bf1eaa..6ff59d7 100644
--- a/variables.tf
+++ b/variables.tf
@@ -29,7 +29,7 @@ variable "use_internal_queue" {
}
variable "size" {
- default = null
+ default = "small"
description = "Deployment size"
-  nullable    = true
+  nullable    = false # an explicit null now falls back to the "small" default instead of reaching the deployment_size lookup as null
type = string
@@ -147,8 +147,8 @@ variable "database_availability_mode" {
variable "database_sku_name" {
type = string
- default = "GP_Standard_D4ds_v4"
- description = "Specifies the SKU Name for this MySQL Server"
+ default = null
+ description = "Specifies the SKU Name for this MySQL Server. Defaults to null and value from deployment-size.tf is used"
}
##########################################
@@ -162,8 +162,8 @@ variable "create_redis" {
variable "redis_capacity" {
type = number
- description = "Number indicating size of an redis instance"
- default = 2
+ description = "Number indicating size of an redis instance. Defaults to null and value from deployment-size.tf is used"
+ default = null
}
##########################################
@@ -212,14 +212,21 @@ variable "bucket_path" {
# K8s #
##########################################
variable "kubernetes_instance_type" {
+ description = "Instance type for primary node group. Defaults to null and value from deployment-size.tf is used"
type = string
- description = "Use for the Kubernetes cluster."
- default = "Standard_D4a_v4"
+ default = null
}
-variable "kubernetes_node_count" {
- default = 2
- type = number
+variable "kubernetes_min_node_per_az" {
+ description = "Minimum number of nodes for the AKS cluster. Defaults to null and value from deployment-size.tf is used"
+ type = number
+ default = null
+}
+
+variable "kubernetes_max_node_per_az" {
+ description = "Maximum number of nodes for the AKS cluster. Defaults to null and value from deployment-size.tf is used"
+ type = number
+ default = null
}
variable "cluster_sku_tier" {
@@ -236,7 +243,7 @@ variable "node_pool_zones" {
variable "node_pool_num_zones" {
type = number
- description = "Number of availability zones to use for the node pool when node_pool_zones is not set."
+ description = "Number of availability zones to use for the node pool when node_pool_zones is not set. If neither are set, 3 zones will be used"
default = 2
}
diff --git a/vmtype_to_az.sh b/vmtype_to_az.sh
new file mode 100755
index 0000000..ae74523
--- /dev/null
+++ b/vmtype_to_az.sh
@@ -0,0 +1,38 @@
+#! /usr/bin/env bash
+
+# Given an Azure VM instance type and a region, print the availability zones that support the instance type.
+# Usage: ./vmtype_to_az.sh <vm_type> <region> [num_zones]
+# Example:
+# ./vmtype_to_az.sh Standard_D2_v3 westeurope
+#
+# Output (a JSON object whose "zones" value is the zone list encoded as a JSON string):
+# {"zones": "[\"1\",\"2\",\"3\"]"}
+
+# Copy script arguments to named variables
+VM_TYPE="$1"
+REGION="$2"
+NUM_ZONES="$3"
+
+# Check that both required arguments are provided
+if [ -z "$VM_TYPE" ] || [ -z "$REGION" ]; then
+  echo "Error: Both VM type and region must be provided." >&2
+  echo "Usage: $0 <vm_type> <region> [num_zones]" >&2
+  exit 1
+fi
+
+# Default to 3 zones if not specified
+if [ -z "$NUM_ZONES" ]; then
+  NUM_ZONES=3
+fi
+
+# Query Azure CLI for the zones; NUM_ZONES is handed to jq as a variable rather than spliced into the filter text
+ZONES=$(az vm list-skus --location "$REGION" --size "$VM_TYPE" --query "[0].locationInfo[0].zones" -o json | jq -r -c --argjson n "$NUM_ZONES" 'sort | .[0:$n]')
+
+# Treat an empty zone list the same as a missing one
+if [ -z "$ZONES" ] || [ "$ZONES" == "null" ] || [ "$ZONES" == "[]" ]; then
+  echo "Error: No availability zones found for VM type $VM_TYPE in region $REGION." >&2
+  exit 1
+fi
+
+# Output the result
+jq -n --arg zones "$ZONES" '{"zones":$zones}'