GoogleCloudPlatform · umeshkumhar · Mar 7, 2024 · Mar 6, 2024 · Mar 6, 2024 · Mar 7, 2024
diff --git a/applications/rag/main.tf b/applications/rag/main.tf
@@ -31,7 +31,7 @@ module "infra" {
 
   project_id        = var.project_id
   cluster_name      = var.cluster_name
-  cluster_region    = var.cluster_location
+  cluster_location  = var.cluster_location
   autopilot_cluster = var.autopilot_cluster
   private_cluster   = var.private_cluster
   create_network    = false

diff --git a/applications/rag/metadata.display.yaml b/applications/rag/metadata.display.yaml
@@ -25,7 +25,10 @@ spec:
           title: Cluster Location
           section: cluster_details
           xGoogleProperty:
-            type: ET_GCE_LOCATION
+            type: ET_GCE_REGION
+            # specified regions have L4 & T4 GPUs https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools
+            gce_region:
+              allowlisted_regions: ["asia-east1","asia-northeast1","asia-northeast3","asia-south1","asia-southeast1","europe-west1","europe-west2","europe-west3","europe-west4","us-central1","us-east1","us-east4","us-west1","us-west4"]
         cluster_name:
           name: cluster_name
           title: Cluster Name
@@ -43,7 +46,6 @@ spec:
         autopilot_cluster:
           name: autopilot_cluster
           title: Autopilot Cluster
-          invisible: true
           section: cluster_details
         create_gcs_bucket:
           name: create_gcs_bucket

diff --git a/applications/rag/metadata.yaml b/applications/rag/metadata.yaml
@@ -20,7 +20,7 @@ spec:
       - name: add_auth
         description: Enable iap authentication on jupyterhub
         varType: bool
-        defaultValue: true
+        defaultValue: false
       - name: autopilot_cluster
         varType: bool
         defaultValue: false

diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf
@@ -274,7 +274,7 @@ variable "cpu_pools" {
   type = list(object({
     name                   = string
     machine_type           = string
-    node_locations         = string
+    node_locations         = optional(string, "")
     autoscaling            = optional(bool, false)
     min_count              = optional(number, 1)
     max_count              = optional(number, 3)
@@ -294,22 +294,21 @@ variable "cpu_pools" {
     accelerator_count      = optional(number, 0)
   }))
   default = [{
-    name           = "cpu-pool"
-    machine_type   = "n1-standard-16"
-    node_locations = "us-central1-b,us-central1-c"
-    autoscaling    = true
-    min_count      = 1
-    max_count      = 3
-    disk_size_gb   = 100
-    disk_type      = "pd-standard"
+    name         = "cpu-pool"
+    machine_type = "n1-standard-16"
+    autoscaling  = true
+    min_count    = 1
+    max_count    = 3
+    disk_size_gb = 100
+    disk_type    = "pd-standard"
   }]
 }
 
 variable "gpu_pools" {
   type = list(object({
     name                   = string
     machine_type           = string
-    node_locations         = string
+    node_locations         = optional(string, "")
     autoscaling            = optional(bool, false)
     min_count              = optional(number, 1)
     max_count              = optional(number, 3)
@@ -330,24 +329,21 @@ variable "gpu_pools" {
     accelerator_type       = optional(string, "nvidia-tesla-t4")
     gpu_driver_version     = optional(string, "DEFAULT")
   }))
-  default = [
-    {
-      name               = "gpu-pool"
-      machine_type       = "n1-standard-16"
-      node_locations     = "us-central1-b,us-central1-c"
-      autoscaling        = true
-      min_count          = 1
-      max_count          = 3
-      disk_size_gb       = 100
-      disk_type          = "pd-standard"
-      accelerator_count  = 2
-      accelerator_type   = "nvidia-tesla-t4"
-      gpu_driver_version = "DEFAULT"
+  default = [{
+    name               = "gpu-pool"
+    machine_type       = "n1-standard-16"
+    autoscaling        = true
+    min_count          = 1
+    max_count          = 3
+    disk_size_gb       = 100
+    disk_type          = "pd-standard"
+    accelerator_count  = 2
+    accelerator_type   = "nvidia-tesla-t4"
+    gpu_driver_version = "DEFAULT"
     },
     {
       name               = "gpu-pool-l4"
       machine_type       = "g2-standard-24"
-      node_locations     = "us-central1-a"
       autoscaling        = true
       min_count          = 1
       max_count          = 3
@@ -357,6 +353,5 @@ variable "gpu_pools" {
       accelerator_count  = 2
       accelerator_type   = "nvidia-l4"
       gpu_driver_version = "DEFAULT"
-    }
-  ]
+  }]
 }
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
@@ -59,7 +59,7 @@ steps:
         -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
         -var=project_id=$PROJECT_ID \
         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-        -var=cluster_region=$_REGION \
+        -var=cluster_location=$_REGION \
         -auto-approve -no-color -lock=false
         echo "pass" > /workspace/gke_cluster_result.txt
     dir: 'infrastructure/'
@@ -300,7 +300,7 @@ steps:
         cd /workspace/infrastructure
         terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-        -var=cluster_region=$_REGION -auto-approve -no-color
+        -var=cluster_location=$_REGION -auto-approve -no-color
 
     allowFailure: true
     waitFor: ['cleanup rag']

diff --git a/infrastructure/main.tf b/infrastructure/main.tf
@@ -16,6 +16,26 @@
 ####    PLATFORM
 #######################################################
 
+## Region => comma-separated zones where L4 & T4 GPUs are supported; used to autofill empty gpu_pools node_locations.
+locals {
+  gpu_l4_t4_location = {
+    asia-east1      = "asia-east1-a,asia-east1-c"
+    asia-northeast1 = "asia-northeast1-a,asia-northeast1-c"
+    asia-northeast3 = "asia-northeast3-b"
+    asia-south1     = "asia-south1-a,asia-south1-b"
+    asia-southeast1 = "asia-southeast1-a,asia-southeast1-b,asia-southeast1-c"
+    europe-west1    = "europe-west1-b,europe-west1-c"
+    europe-west2    = "europe-west2-a,europe-west2-b"
+    europe-west3    = "europe-west3-b"
+    europe-west4    = "europe-west4-a,europe-west4-b,europe-west4-c"
+    us-central1     = "us-central1-a,us-central1-b,us-central1-c"
+    us-east1        = "us-east1-c,us-east1-d"
+    us-east4        = "us-east4-a,us-east4-c"
+    us-west1        = "us-west1-a,us-west1-b"
+    us-west4        = "us-west4-a"
+  }
+}
+
 module "custom-network" {
   source       = "terraform-google-modules/network/google"
   version      = "8.0.0"
@@ -40,8 +60,14 @@ module "custom-network" {
 locals {
   network_name    = var.create_network ? module.custom-network[0].network_name : var.network_name
   subnetwork_name = var.create_network ? module.custom-network[0].subnets_names[0] : var.subnetwork_name
+  region          = length(split("-", var.cluster_location)) == 2 ? var.cluster_location : "" # exactly two "-"-separated parts (e.g. "us-central1") => region
+  regional        = local.region != ""
+  zone            = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : [] # more parts => comma-separated list of zones
+  # Fill in node_locations for GPU pools that left it empty, using the L4/T4 zones known for the cluster region (no-op for zonal clusters or unlisted regions)
+  gpu_pools = [for elm in var.gpu_pools : elm["node_locations"] == "" ? merge(elm, { "node_locations" : lookup(local.gpu_l4_t4_location, local.region, "") }) : elm]
 }
 
+
 ## create public GKE standard
 module "public-gke-standard-cluster" {
   count      = var.create_cluster && !var.private_cluster && !var.autopilot_cluster ? 1 : 0
@@ -53,13 +79,13 @@ module "public-gke-standard-cluster" {
   subnetwork_name = local.subnetwork_name
 
   ## gke variables
-  cluster_regional                     = var.cluster_regional
+  cluster_regional                     = local.regional
+  cluster_region                       = local.region
+  cluster_zones                        = local.zone
   cluster_name                         = var.cluster_name
   cluster_labels                       = var.cluster_labels
   kubernetes_version                   = var.kubernetes_version
   release_channel                      = var.release_channel
-  cluster_region                       = var.cluster_region
-  cluster_zones                        = var.cluster_zones
   ip_range_pods                        = var.ip_range_pods
   ip_range_services                    = var.ip_range_services
   monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
@@ -70,7 +96,7 @@ module "public-gke-standard-cluster" {
   ## pools config variables
   cpu_pools                   = var.cpu_pools
   enable_gpu                  = var.enable_gpu
-  gpu_pools                   = var.gpu_pools
+  gpu_pools                   = local.gpu_pools
   enable_tpu                  = var.enable_tpu
   tpu_pools                   = var.tpu_pools
   all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
@@ -90,13 +116,13 @@ module "public-gke-autopilot-cluster" {
   subnetwork_name = local.subnetwork_name
 
   ## gke variables
-  cluster_regional           = var.cluster_regional
+  cluster_regional           = local.regional
+  cluster_region             = local.region
+  cluster_zones              = local.zone
   cluster_name               = var.cluster_name
   cluster_labels             = var.cluster_labels
   kubernetes_version         = var.kubernetes_version
   release_channel            = var.release_channel
-  cluster_region             = var.cluster_region
-  cluster_zones              = var.cluster_zones
   ip_range_pods              = var.ip_range_pods
   ip_range_services          = var.ip_range_services
   master_authorized_networks = var.master_authorized_networks
@@ -115,13 +141,13 @@ module "private-gke-standard-cluster" {
   subnetwork_name = local.subnetwork_name
 
   ## gke variables
-  cluster_regional                     = var.cluster_regional
+  cluster_regional                     = local.regional
+  cluster_region                       = local.region
+  cluster_zones                        = local.zone
   cluster_name                         = var.cluster_name
   cluster_labels                       = var.cluster_labels
   kubernetes_version                   = var.kubernetes_version
   release_channel                      = var.release_channel
-  cluster_region                       = var.cluster_region
-  cluster_zones                        = var.cluster_zones
   ip_range_pods                        = var.ip_range_pods
   ip_range_services                    = var.ip_range_services
   monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
@@ -133,7 +159,7 @@ module "private-gke-standard-cluster" {
   ## pools config variables
   cpu_pools                   = var.cpu_pools
   enable_gpu                  = var.enable_gpu
-  gpu_pools                   = var.gpu_pools
+  gpu_pools                   = local.gpu_pools
   enable_tpu                  = var.enable_tpu
   tpu_pools                   = var.tpu_pools
   all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
@@ -153,13 +179,13 @@ module "private-gke-autopilot-cluster" {
   subnetwork_name = local.subnetwork_name
 
   ## gke variables
-  cluster_regional           = var.cluster_regional
+  cluster_regional           = local.regional
+  cluster_region             = local.region
+  cluster_zones              = local.zone
   cluster_name               = var.cluster_name
   cluster_labels             = var.cluster_labels
   kubernetes_version         = var.kubernetes_version
   release_channel            = var.release_channel
-  cluster_region             = var.cluster_region
-  cluster_zones              = var.cluster_zones
   ip_range_pods              = var.ip_range_pods
   ip_range_services          = var.ip_range_services
   master_authorized_networks = var.master_authorized_networks

diff --git a/infrastructure/outputs.tf b/infrastructure/outputs.tf
@@ -20,8 +20,8 @@ output "cluster_name" {
   value = var.cluster_name
 }
 
-output "cluster_region" {
-  value = var.cluster_region
+output "cluster_location" {
+  value = var.cluster_location
 }
 
 output "endpoint" {

diff --git a/infrastructure/platform.tfvars b/infrastructure/platform.tfvars
@@ -20,34 +20,32 @@ project_id = "<your project ID>"
 ## network values
 create_network    = true
 network_name      = "ml-network"
-subnetwork_name   = "ml-subnet1"
+subnetwork_name   = "ml-subnet"
 subnetwork_cidr   = "10.100.0.0/16"
 subnetwork_region = "us-central1"
 
 ## gke variables
 private_cluster   = false ## true = private cluster, false = public cluster
 autopilot_cluster = false ## true = autopilot cluster, false = standard cluster
 cluster_name      = "ml-cluster"
-cluster_region    = "us-central1"
-cluster_zones     = ["us-central1-a", "us-central1-b", "us-central1-c"]
+cluster_location  = "us-central1"
 
 cpu_pools = [{
-  name           = "cpu-pool"
-  machine_type   = "n1-standard-16"
-  node_locations = "us-central1-b,us-central1-c"
-  autoscaling    = true
-  min_count      = 1
-  max_count      = 3
-  disk_size_gb   = 100
-  disk_type      = "pd-standard"
+  name         = "cpu-pool"
+  machine_type = "n1-standard-16"
+  autoscaling  = true
+  min_count    = 1
+  max_count    = 3
+  disk_size_gb = 100
+  disk_type    = "pd-standard"
 }]
 
 ## make sure required gpu quotas are available in that region
 enable_gpu = true
 gpu_pools = [{
   name               = "gpu-pool"
   machine_type       = "n1-standard-16"
-  node_locations     = "us-central1-a"
+  node_locations     = "us-central1-a" ## comment out to autofill node_locations from cluster_location (regional clusters only)
   autoscaling        = true
   min_count          = 1
   max_count          = 3
@@ -60,7 +58,7 @@ gpu_pools = [{
   {
     name               = "gpu-pool-l4"
     machine_type       = "g2-standard-24"
-    node_locations     = "us-central1-a"
+    node_locations     = "us-central1-a" ## comment out to autofill node_locations from cluster_location (regional clusters only)
     autoscaling        = true
     min_count          = 1
     max_count          = 3

diff --git a/infrastructure/tfvars_examples/autopilot-gke-with-existing-network.platform.tfvars b/infrastructure/tfvars_examples/autopilot-gke-with-existing-network.platform.tfvars
@@ -27,5 +27,4 @@ subnetwork_name = "demo-subnet"
 private_cluster   = true ## Default true. Use false for a public cluster
 autopilot_cluster = true # false = standard cluster, true = autopilot cluster
 cluster_name      = "demo-cluster"
-cluster_region    = "us-central1"
-cluster_zones     = ["us-central1-a", "us-central1-b", "us-central1-f"]
+cluster_location  = "us-central1" ## Zonal autopilot clusters are not supported.
diff --git a/infrastructure/tfvars_examples/autopilot-gke-with-new-network.platform.tfvars b/infrastructure/tfvars_examples/autopilot-gke-with-new-network.platform.tfvars
@@ -30,5 +30,4 @@ subnetwork_region = "us-central1"
 private_cluster   = true ## Default true. Use false for a public cluster
 autopilot_cluster = true # false = standard cluster, true = autopilot cluster
 cluster_name      = "demo-cluster"
-cluster_region    = "us-central1"
-cluster_zones     = ["us-central1-a", "us-central1-b", "us-central1-f"]
+cluster_location  = "us-central1" ## Zonal autopilot clusters are not supported.
diff --git a/infrastructure/tfvars_examples/platform.complete.tfvars b/infrastructure/tfvars_examples/platform.complete.tfvars
@@ -47,9 +47,7 @@ private_cluster                      = true ## Default true. Use false for a pub
 autopilot_cluster                    = true # false = standard cluster, true = autopilot cluster
 cluster_name                         = "ml-cluster"
 kubernetes_version                   = "1.28"
-cluster_regional                     = true
-cluster_region                       = "us-central1"
-cluster_zones                        = ["us-central1-a", "us-central1-b", "us-central1-f"]
+cluster_location                     = "us-central1"
 ip_range_pods                        = "us-central1-01-gke-01-pods-1"
 ip_range_services                    = "us-central1-01-gke-01-services-1"
 monitoring_enable_managed_prometheus = true

diff --git a/infrastructure/tfvars_examples/standard-gke-with-exisiting-network.platform.tfvars b/infrastructure/tfvars_examples/standard-gke-with-exisiting-network.platform.tfvars
@@ -32,26 +32,23 @@ private_cluster = true ## Default true. Use false for a public cluster
 # }]
 autopilot_cluster = false # false = standard cluster, true = autopilot cluster
 cluster_name      = "demo-cluster-1"
-cluster_region    = "us-central1"
-cluster_zones     = ["us-central1-a", "us-central1-b", "us-central1-f"]
+cluster_location  = "us-central1"
 
 cpu_pools = [{
-  name           = "cpu-pool"
-  machine_type   = "n1-standard-16"
-  node_locations = "us-central1-b,us-central1-c"
-  autoscaling    = true
-  min_count      = 1
-  max_count      = 3
-  disk_size_gb   = 100
-  disk_type      = "pd-standard"
+  name         = "cpu-pool"
+  machine_type = "n1-standard-16"
+  autoscaling  = true
+  min_count    = 1
+  max_count    = 3
+  disk_size_gb = 100
+  disk_type    = "pd-standard"
 }]
 
 ## make sure required gpu quotas are available in that region
 enable_gpu = true
 gpu_pools = [{
   name               = "gpu-pool"
   machine_type       = "n1-standard-16"
-  node_locations     = "us-central1-b,us-central1-c"
   autoscaling        = true
   min_count          = 1
   max_count          = 3