diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 63eb89ac4..e3e56ed47 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -31,7 +31,7 @@ module "infra" { project_id = var.project_id cluster_name = var.cluster_name - cluster_region = var.cluster_location + cluster_location = var.cluster_location autopilot_cluster = var.autopilot_cluster private_cluster = var.private_cluster create_network = false diff --git a/applications/rag/metadata.display.yaml b/applications/rag/metadata.display.yaml index de75aba7f..bbf7dca1c 100644 --- a/applications/rag/metadata.display.yaml +++ b/applications/rag/metadata.display.yaml @@ -25,7 +25,10 @@ spec: title: Cluster Location section: cluster_details xGoogleProperty: - type: ET_GCE_LOCATION + type: ET_GCE_REGION + # specified regions have L4 & T4 GPUs https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools + gce_region: + allowlisted_regions: ["asia-east1","asia-northeast1","asia-northeast3","asia-south1","asia-southeast1","europe-west1","europe-west2","europe-west3","europe-west4","us-central1","us-east1","us-east4","us-west1","us-west4"] cluster_name: name: cluster_name title: Cluster Name @@ -43,7 +46,6 @@ spec: autopilot_cluster: name: autopilot_cluster title: Autopilot Cluster - invisible: true section: cluster_details create_gcs_bucket: name: create_gcs_bucket diff --git a/applications/rag/metadata.yaml b/applications/rag/metadata.yaml index b5a0861fa..74c749fe8 100644 --- a/applications/rag/metadata.yaml +++ b/applications/rag/metadata.yaml @@ -20,7 +20,7 @@ spec: - name: add_auth description: Enable iap authentication on jupyterhub varType: bool - defaultValue: true + defaultValue: false - name: autopilot_cluster varType: bool defaultValue: false diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf index a5e77ff78..db4389322 100644 --- a/applications/rag/variables.tf +++ b/applications/rag/variables.tf @@ -274,7 +274,7 @@ variable "cpu_pools" 
{ type = list(object({ name = string machine_type = string - node_locations = string + node_locations = optional(string, "") autoscaling = optional(bool, false) min_count = optional(number, 1) max_count = optional(number, 3) @@ -294,14 +294,13 @@ variable "cpu_pools" { accelerator_count = optional(number, 0) })) default = [{ - name = "cpu-pool" - machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" - autoscaling = true - min_count = 1 - max_count = 3 - disk_size_gb = 100 - disk_type = "pd-standard" + name = "cpu-pool" + machine_type = "n1-standard-16" + autoscaling = true + min_count = 1 + max_count = 3 + disk_size_gb = 100 + disk_type = "pd-standard" }] } @@ -309,7 +308,7 @@ variable "gpu_pools" { type = list(object({ name = string machine_type = string - node_locations = string + node_locations = optional(string, "") autoscaling = optional(bool, false) min_count = optional(number, 1) max_count = optional(number, 3) @@ -330,24 +329,21 @@ variable "gpu_pools" { accelerator_type = optional(string, "nvidia-tesla-t4") gpu_driver_version = optional(string, "DEFAULT") })) - default = [ - { - name = "gpu-pool" - machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" - autoscaling = true - min_count = 1 - max_count = 3 - disk_size_gb = 100 - disk_type = "pd-standard" - accelerator_count = 2 - accelerator_type = "nvidia-tesla-t4" - gpu_driver_version = "DEFAULT" + default = [{ + name = "gpu-pool" + machine_type = "n1-standard-16" + autoscaling = true + min_count = 1 + max_count = 3 + disk_size_gb = 100 + disk_type = "pd-standard" + accelerator_count = 2 + accelerator_type = "nvidia-tesla-t4" + gpu_driver_version = "DEFAULT" }, { name = "gpu-pool-l4" machine_type = "g2-standard-24" - node_locations = "us-central1-a" autoscaling = true min_count = 1 max_count = 3 @@ -357,6 +353,5 @@ variable "gpu_pools" { accelerator_count = 2 accelerator_type = "nvidia-l4" gpu_driver_version = "DEFAULT" - } - ] + }] } diff --git 
a/cloudbuild.yaml b/cloudbuild.yaml index f33a15a96..d82f1b8ab 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -59,7 +59,7 @@ steps: -var-file=tfvars_tests/standard-gke-public.platform.tfvars \ -var=project_id=$PROJECT_ID \ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \ - -var=cluster_region=$_REGION \ + -var=cluster_location=$_REGION \ -auto-approve -no-color -lock=false echo "pass" > /workspace/gke_cluster_result.txt dir: 'infrastructure/' @@ -300,7 +300,7 @@ steps: cd /workspace/infrastructure terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \ - -var=cluster_region=$_REGION -auto-approve -no-color + -var=cluster_location=$_REGION -auto-approve -no-color allowFailure: true waitFor: ['cleanup rag'] diff --git a/infrastructure/main.tf b/infrastructure/main.tf index e8826c5f9..243281a00 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -16,6 +16,26 @@ #### PLATFORM ####################################################### +## GPU locations where L4 & T4 are supported. 
+locals { + gpu_l4_t4_location = { + asia-east1 = "asia-east1-a,asia-east1-c" + asia-northeast1 = "asia-northeast1-a,asia-northeast1-c" + asia-northeast3 = "asia-northeast3-b" + asia-south1 = "asia-south1-a,asia-south1-b" + asia-southeast1 = "asia-southeast1-a,asia-southeast1-b,asia-southeast1-c" + europe-west1 = "europe-west1-b,europe-west1-c" + europe-west2 = "europe-west2-a,europe-west2-b" + europe-west3 = "europe-west3-b" + europe-west4 = "europe-west4-a,europe-west4-b,europe-west4-c" + us-central1 = "us-central1-a,us-central1-b,us-central1-c" + us-east1 = "us-east1-c,us-east1-d" + us-east4 = "us-east4-a,us-east4-c" + us-west1 = "us-west1-a,us-west1-b" + us-west4 = "us-west4-a" + } +} + module "custom-network" { source = "terraform-google-modules/network/google" version = "8.0.0" @@ -40,8 +60,14 @@ module "custom-network" { locals { network_name = var.create_network ? module.custom-network[0].network_name : var.network_name subnetwork_name = var.create_network ? module.custom-network[0].subnets_names[0] : var.subnetwork_name + region = length(split("-", var.cluster_location)) == 2 ? var.cluster_location : "" + regional = local.region != "" + zone = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : [] + # For regional clusters in a region listed in gpu_l4_t4_location, fill each GPU pool's empty node_locations with that region's zones; pools with node_locations set are left unchanged + gpu_pools = [for elm in var.gpu_pools : (local.regional && contains(keys(local.gpu_l4_t4_location), local.region) && elm["node_locations"] == "") ? merge(elm, { "node_locations" : local.gpu_l4_t4_location[local.region] }) : elm] } + ## create public GKE standard module "public-gke-standard-cluster" { count = var.create_cluster && !var.private_cluster && !var.autopilot_cluster ? 
1 : 0 @@ -53,13 +79,13 @@ module "public-gke-standard-cluster" { subnetwork_name = local.subnetwork_name ## gke variables - cluster_regional = var.cluster_regional + cluster_regional = local.regional + cluster_region = local.region + cluster_zones = local.zone cluster_name = var.cluster_name cluster_labels = var.cluster_labels kubernetes_version = var.kubernetes_version release_channel = var.release_channel - cluster_region = var.cluster_region - cluster_zones = var.cluster_zones ip_range_pods = var.ip_range_pods ip_range_services = var.ip_range_services monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus @@ -70,7 +96,7 @@ module "public-gke-standard-cluster" { ## pools config variables cpu_pools = var.cpu_pools enable_gpu = var.enable_gpu - gpu_pools = var.gpu_pools + gpu_pools = local.gpu_pools enable_tpu = var.enable_tpu tpu_pools = var.tpu_pools all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes @@ -90,13 +116,13 @@ module "public-gke-autopilot-cluster" { subnetwork_name = local.subnetwork_name ## gke variables - cluster_regional = var.cluster_regional + cluster_regional = local.regional + cluster_region = local.region + cluster_zones = local.zone cluster_name = var.cluster_name cluster_labels = var.cluster_labels kubernetes_version = var.kubernetes_version release_channel = var.release_channel - cluster_region = var.cluster_region - cluster_zones = var.cluster_zones ip_range_pods = var.ip_range_pods ip_range_services = var.ip_range_services master_authorized_networks = var.master_authorized_networks @@ -115,13 +141,13 @@ module "private-gke-standard-cluster" { subnetwork_name = local.subnetwork_name ## gke variables - cluster_regional = var.cluster_regional + cluster_regional = local.regional + cluster_region = local.region + cluster_zones = local.zone cluster_name = var.cluster_name cluster_labels = var.cluster_labels kubernetes_version = var.kubernetes_version release_channel = var.release_channel - cluster_region = 
var.cluster_region - cluster_zones = var.cluster_zones ip_range_pods = var.ip_range_pods ip_range_services = var.ip_range_services monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus @@ -133,7 +159,7 @@ module "private-gke-standard-cluster" { ## pools config variables cpu_pools = var.cpu_pools enable_gpu = var.enable_gpu - gpu_pools = var.gpu_pools + gpu_pools = local.gpu_pools enable_tpu = var.enable_tpu tpu_pools = var.tpu_pools all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes @@ -153,13 +179,13 @@ module "private-gke-autopilot-cluster" { subnetwork_name = local.subnetwork_name ## gke variables - cluster_regional = var.cluster_regional + cluster_regional = local.regional + cluster_region = local.region + cluster_zones = local.zone cluster_name = var.cluster_name cluster_labels = var.cluster_labels kubernetes_version = var.kubernetes_version release_channel = var.release_channel - cluster_region = var.cluster_region - cluster_zones = var.cluster_zones ip_range_pods = var.ip_range_pods ip_range_services = var.ip_range_services master_authorized_networks = var.master_authorized_networks diff --git a/infrastructure/outputs.tf b/infrastructure/outputs.tf index 9802d7f42..95430633b 100644 --- a/infrastructure/outputs.tf +++ b/infrastructure/outputs.tf @@ -20,8 +20,8 @@ output "cluster_name" { value = var.cluster_name } -output "cluster_region" { - value = var.cluster_region +output "cluster_location" { + value = var.cluster_location } output "endpoint" { diff --git a/infrastructure/platform.tfvars b/infrastructure/platform.tfvars index 5d704bcd5..e3b55a928 100644 --- a/infrastructure/platform.tfvars +++ b/infrastructure/platform.tfvars @@ -20,7 +20,7 @@ project_id = "" ## network values create_network = true network_name = "ml-network" -subnetwork_name = "ml-subnet1" +subnetwork_name = "ml-subnet" subnetwork_cidr = "10.100.0.0/16" subnetwork_region = "us-central1" @@ -28,18 +28,16 @@ subnetwork_region = "us-central1" 
private_cluster = false ## true = private cluster, false = public cluster autopilot_cluster = false ## true = autopilot cluster, false = standard cluster cluster_name = "ml-cluster" -cluster_region = "us-central1" -cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-c"] +cluster_location = "us-central1" cpu_pools = [{ - name = "cpu-pool" - machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" - autoscaling = true - min_count = 1 - max_count = 3 - disk_size_gb = 100 - disk_type = "pd-standard" + name = "cpu-pool" + machine_type = "n1-standard-16" + autoscaling = true + min_count = 1 + max_count = 3 + disk_size_gb = 100 + disk_type = "pd-standard" }] ## make sure required gpu quotas are available in that region @@ -47,7 +45,7 @@ enable_gpu = true gpu_pools = [{ name = "gpu-pool" machine_type = "n1-standard-16" - node_locations = "us-central1-a" + node_locations = "us-central1-a" ## comment out to auto-fill node_locations from cluster_location (regional clusters only) autoscaling = true min_count = 1 max_count = 3 @@ -60,7 +58,7 @@ gpu_pools = [{ { name = "gpu-pool-l4" machine_type = "g2-standard-24" - node_locations = "us-central1-a" + node_locations = "us-central1-a" ## comment out to auto-fill node_locations from cluster_location (regional clusters only) autoscaling = true min_count = 1 max_count = 3 diff --git a/infrastructure/tfvars_examples/autopilot-gke-with-existing-network.platform.tfvars b/infrastructure/tfvars_examples/autopilot-gke-with-existing-network.platform.tfvars index 8ba4169e5..851075557 100644 --- a/infrastructure/tfvars_examples/autopilot-gke-with-existing-network.platform.tfvars +++ b/infrastructure/tfvars_examples/autopilot-gke-with-existing-network.platform.tfvars @@ -27,5 +27,4 @@ subnetwork_name = "demo-subnet" private_cluster = true ## Default true. 
Use false for a public cluster autopilot_cluster = true # false = standard cluster, true = autopilot cluster cluster_name = "demo-cluster" -cluster_region = "us-central1" -cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"] +cluster_location = "us-central1" ## Zonal autopilot clusters are not supported. diff --git a/infrastructure/tfvars_examples/autopilot-gke-with-new-network.platform.tfvars b/infrastructure/tfvars_examples/autopilot-gke-with-new-network.platform.tfvars index df88f0ccf..9b8ef740e 100644 --- a/infrastructure/tfvars_examples/autopilot-gke-with-new-network.platform.tfvars +++ b/infrastructure/tfvars_examples/autopilot-gke-with-new-network.platform.tfvars @@ -30,5 +30,4 @@ subnetwork_region = "us-central1" private_cluster = true ## Default true. Use false for a public cluster autopilot_cluster = true # false = standard cluster, true = autopilot cluster cluster_name = "demo-cluster" -cluster_region = "us-central1" -cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"] +cluster_location = "us-central1" ## Zonal autopilot clusters are not supported. diff --git a/infrastructure/tfvars_examples/platform.complete.tfvars b/infrastructure/tfvars_examples/platform.complete.tfvars index 966435151..a46b2bf50 100644 --- a/infrastructure/tfvars_examples/platform.complete.tfvars +++ b/infrastructure/tfvars_examples/platform.complete.tfvars @@ -47,9 +47,7 @@ private_cluster = true ## Default true. 
Use false for a pub autopilot_cluster = true # false = standard cluster, true = autopilot cluster cluster_name = "ml-cluster" kubernetes_version = "1.28" -cluster_regional = true -cluster_region = "us-central1" -cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"] +cluster_location = "us-central1" ip_range_pods = "us-central1-01-gke-01-pods-1" ip_range_services = "us-central1-01-gke-01-services-1" monitoring_enable_managed_prometheus = true diff --git a/infrastructure/tfvars_examples/standard-gke-with-exisiting-network.platform.tfvars b/infrastructure/tfvars_examples/standard-gke-with-exisiting-network.platform.tfvars index 38ba50d4a..1cd11c346 100644 --- a/infrastructure/tfvars_examples/standard-gke-with-exisiting-network.platform.tfvars +++ b/infrastructure/tfvars_examples/standard-gke-with-exisiting-network.platform.tfvars @@ -32,18 +32,16 @@ private_cluster = true ## Default true. Use false for a public cluster # }] autopilot_cluster = false # false = standard cluster, true = autopilot cluster cluster_name = "demo-cluster-1" -cluster_region = "us-central1" -cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"] +cluster_location = "us-central1" cpu_pools = [{ - name = "cpu-pool" - machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" - autoscaling = true - min_count = 1 - max_count = 3 - disk_size_gb = 100 - disk_type = "pd-standard" + name = "cpu-pool" + machine_type = "n1-standard-16" + autoscaling = true + min_count = 1 + max_count = 3 + disk_size_gb = 100 + disk_type = "pd-standard" }] ## make sure required gpu quotas are available in that region @@ -51,7 +49,6 @@ enable_gpu = true gpu_pools = [{ name = "gpu-pool" machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" autoscaling = true min_count = 1 max_count = 3 diff --git a/infrastructure/tfvars_examples/standard-gke-with-new-network.platform.tfvars 
b/infrastructure/tfvars_examples/standard-gke-with-new-network.platform.tfvars index 365d7fe0c..8fe023d80 100644 --- a/infrastructure/tfvars_examples/standard-gke-with-new-network.platform.tfvars +++ b/infrastructure/tfvars_examples/standard-gke-with-new-network.platform.tfvars @@ -29,18 +29,16 @@ subnetwork_region = "us-central1" private_cluster = false ## Default true. Use false for a public cluster autopilot_cluster = false # false = standard cluster, true = autopilot cluster cluster_name = "demo-cluster-1" -cluster_region = "us-central1" -cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"] +cluster_location = "us-central1-a" cpu_pools = [{ - name = "cpu-pool" - machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" - autoscaling = true - min_count = 1 - max_count = 3 - disk_size_gb = 100 - disk_type = "pd-standard" + name = "cpu-pool" + machine_type = "n1-standard-16" + autoscaling = true + min_count = 1 + max_count = 3 + disk_size_gb = 100 + disk_type = "pd-standard" }] ## make sure required gpu quotas are available in that region @@ -48,7 +46,6 @@ enable_gpu = true gpu_pools = [{ name = "gpu-pool" machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" autoscaling = true min_count = 1 max_count = 3 diff --git a/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars b/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars index 86d951569..21c8c2bcc 100644 --- a/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars +++ b/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars @@ -31,23 +31,20 @@ subnetwork_name = "default" subnetwork_region = "us-central1" ## gke variables -private_cluster = false ## Default true. 
Use false for a public cluster -autopilot_cluster = false # false = standard cluster, true = autopilot cluster -cluster_name = "test-cluster" -cluster_region = "us-central1" -cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"] - +private_cluster = false ## Default true. Use false for a public cluster +autopilot_cluster = false # false = standard cluster, true = autopilot cluster +cluster_name = "test-cluster" +cluster_location = "us-central1" gcs_fuse_csi_driver = true cpu_pools = [{ - name = "cpu-pool" - machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" - autoscaling = true - min_count = 1 - max_count = 3 - disk_size_gb = 100 - disk_type = "pd-standard" + name = "cpu-pool" + machine_type = "n1-standard-16" + autoscaling = true + min_count = 1 + max_count = 3 + disk_size_gb = 100 + disk_type = "pd-standard" }] ## make sure required gpu quotas are available in the corresponding region diff --git a/infrastructure/variables.tf b/infrastructure/variables.tf index 27c48696a..43224d97b 100644 --- a/infrastructure/variables.tf +++ b/infrastructure/variables.tf @@ -103,14 +103,10 @@ variable "release_channel" { default = "RAPID" } -variable "cluster_region" { +variable "cluster_location" { type = string } -variable "cluster_zones" { - type = list(string) - default = [] -} variable "ip_range_pods" { type = string default = "" @@ -187,7 +183,7 @@ variable "cpu_pools" { type = list(object({ name = string machine_type = string - node_locations = string + node_locations = optional(string, "") autoscaling = optional(bool, false) min_count = optional(number, 1) max_count = optional(number, 3) @@ -207,14 +203,13 @@ variable "cpu_pools" { accelerator_count = optional(number, 0) })) default = [{ - name = "cpu-pool" - machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" - autoscaling = true - min_count = 1 - max_count = 3 - disk_size_gb = 100 - disk_type = "pd-standard" + name = "cpu-pool" + machine_type = 
"n1-standard-16" + autoscaling = true + min_count = 1 + max_count = 3 + disk_size_gb = 100 + disk_type = "pd-standard" }] } @@ -222,7 +217,7 @@ variable "gpu_pools" { type = list(object({ name = string machine_type = string - node_locations = string + node_locations = optional(string, "") autoscaling = optional(bool, false) min_count = optional(number, 1) max_count = optional(number, 3) @@ -246,7 +241,6 @@ variable "gpu_pools" { default = [{ name = "gpu-pool" machine_type = "n1-standard-16" - node_locations = "us-central1-b,us-central1-c" autoscaling = true min_count = 1 max_count = 3 diff --git a/modules/jupyter/main.tf b/modules/jupyter/main.tf index 46c857ffb..0325ebaa3 100644 --- a/modules/jupyter/main.tf +++ b/modules/jupyter/main.tf @@ -152,12 +152,9 @@ resource "helm_release" "jupyterhub" { ephemeral_storage = var.ephemeral_storage }) ] : [templatefile("${path.module}/jupyter_config/config-selfauth.yaml", { - password = var.add_auth ? "dummy" : random_password.generated_password[0].result - project_id = var.project_id - project_number = data.google_project.project.number - - # Support legacy image. - service_id = "" # TODO(umeshkumhar): var.add_auth ? (data.google_compute_backend_service.jupyter-ingress[0].generated_id != null ? data.google_compute_backend_service.jupyter-ingress[0].generated_id : "no-id-yet") : "no-id-yet" + password = var.add_auth ? "dummy" : random_password.generated_password[0].result + project_id = var.project_id + project_number = data.google_project.project.number namespace = var.namespace backend_config = var.k8s_backend_config_name service_name = var.k8s_backend_service_name