Skip to content

Commit

Permalink
Add support for zonal gke standard cluster (#271)
Browse files Browse the repository at this point in the history
* support for zonal cluster in infrastructure
  • Loading branch information
umeshkumhar committed Mar 7, 2024
1 parent 6b844bc commit f33cc2b
Show file tree
Hide file tree
Showing 16 changed files with 125 additions and 126 deletions.
2 changes: 1 addition & 1 deletion applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ module "infra" {

project_id = var.project_id
cluster_name = var.cluster_name
cluster_region = var.cluster_location
cluster_location = var.cluster_location
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = false
Expand Down
6 changes: 4 additions & 2 deletions applications/rag/metadata.display.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ spec:
title: Cluster Location
section: cluster_details
xGoogleProperty:
type: ET_GCE_LOCATION
type: ET_GCE_REGION
# The allowlisted regions below have L4 & T4 GPUs; see https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools
gce_region:
allowlisted_regions: ["asia-east1","asia-northeast1","asia-northeast3","asia-south1","asia-southeast1","europe-west1","europe-west2","europe-west3","europe-west4","us-central1","us-east1","us-east4","us-west1","us-west4"]
cluster_name:
name: cluster_name
title: Cluster Name
Expand All @@ -43,7 +46,6 @@ spec:
autopilot_cluster:
name: autopilot_cluster
title: Autopilot Cluster
invisible: true
section: cluster_details
create_gcs_bucket:
name: create_gcs_bucket
Expand Down
2 changes: 1 addition & 1 deletion applications/rag/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ spec:
- name: add_auth
description: Enable iap authentication on jupyterhub
varType: bool
defaultValue: true
defaultValue: false
- name: autopilot_cluster
varType: bool
defaultValue: false
Expand Down
47 changes: 21 additions & 26 deletions applications/rag/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ variable "cpu_pools" {
type = list(object({
name = string
machine_type = string
node_locations = string
node_locations = optional(string, "")
autoscaling = optional(bool, false)
min_count = optional(number, 1)
max_count = optional(number, 3)
Expand All @@ -294,22 +294,21 @@ variable "cpu_pools" {
accelerator_count = optional(number, 0)
}))
default = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
name = "cpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
}]
}

variable "gpu_pools" {
type = list(object({
name = string
machine_type = string
node_locations = string
node_locations = optional(string, "")
autoscaling = optional(bool, false)
min_count = optional(number, 1)
max_count = optional(number, 3)
Expand All @@ -330,24 +329,21 @@ variable "gpu_pools" {
accelerator_type = optional(string, "nvidia-tesla-t4")
gpu_driver_version = optional(string, "DEFAULT")
}))
default = [
{
name = "gpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
accelerator_count = 2
accelerator_type = "nvidia-tesla-t4"
gpu_driver_version = "DEFAULT"
default = [{
name = "gpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
accelerator_count = 2
accelerator_type = "nvidia-tesla-t4"
gpu_driver_version = "DEFAULT"
},
{
name = "gpu-pool-l4"
machine_type = "g2-standard-24"
node_locations = "us-central1-a"
autoscaling = true
min_count = 1
max_count = 3
Expand All @@ -357,6 +353,5 @@ variable "gpu_pools" {
accelerator_count = 2
accelerator_type = "nvidia-l4"
gpu_driver_version = "DEFAULT"
}
]
}]
}
4 changes: 2 additions & 2 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ steps:
-var-file=tfvars_tests/standard-gke-public.platform.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_region=$_REGION \
-var=cluster_location=$_REGION \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/gke_cluster_result.txt
dir: 'infrastructure/'
Expand Down Expand Up @@ -300,7 +300,7 @@ steps:
cd /workspace/infrastructure
terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_region=$_REGION -auto-approve -no-color
-var=cluster_location=$_REGION -auto-approve -no-color
allowFailure: true
waitFor: ['cleanup rag']
Expand Down
54 changes: 40 additions & 14 deletions infrastructure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,26 @@
#### PLATFORM
#######################################################

## Zones, per region, where L4 & T4 GPUs are supported.
## Keys are region names; each value is a comma-separated string of zone names
## in that region. It is used to fill in a GPU pool's node_locations when the
## pool does not set one and the cluster is regional.
locals {
  gpu_l4_t4_location = {
    asia-east1      = "asia-east1-a,asia-east1-c"
    asia-northeast1 = "asia-northeast1-a,asia-northeast1-c"
    asia-northeast3 = "asia-northeast3-b"
    asia-south1     = "asia-south1-a,asia-south1-b"
    asia-southeast1 = "asia-southeast1-a,asia-southeast1-b,asia-southeast1-c"
    europe-west1    = "europe-west1-b,europe-west1-c"
    europe-west2    = "europe-west2-a,europe-west2-b"
    europe-west3    = "europe-west3-b"
    europe-west4    = "europe-west4-a,europe-west4-b,europe-west4-c"
    us-central1     = "us-central1-a,us-central1-b,us-central1-c"
    us-east1        = "us-east1-c,us-east1-d"
    us-east4        = "us-east4-a,us-east4-c"
    us-west1        = "us-west1-a,us-west1-b"
    us-west4        = "us-west4-a"
  }
}

module "custom-network" {
source = "terraform-google-modules/network/google"
version = "8.0.0"
Expand All @@ -40,8 +60,14 @@ module "custom-network" {
locals {
  # Use the network/subnetwork created by the custom-network module when
  # create_network is true; otherwise use the names supplied by the caller.
  network_name    = var.create_network ? module.custom-network[0].network_name : var.network_name
  subnetwork_name = var.create_network ? module.custom-network[0].subnets_names[0] : var.subnetwork_name

  # A cluster_location with exactly two "-"-separated parts (e.g. "us-central1")
  # is treated as a region; anything else leaves region empty.
  region   = length(split("-", var.cluster_location)) == 2 ? var.cluster_location : ""
  regional = local.region != ""

  # A cluster_location with more than two "-"-separated parts (e.g. "us-central1-a")
  # is treated as a comma-separated list of zones.
  zone = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : []

  # Fill in node_locations for GPU pools that did not provide one, based on
  # per-region GPU zone availability. Applies only to regional clusters whose
  # region has an entry in gpu_l4_t4_location; other pools pass through unchanged.
  gpu_pools = [
    for pool in var.gpu_pools : (
      local.regional && pool["node_locations"] == "" && contains(keys(local.gpu_l4_t4_location), local.region)
      ? merge(pool, { node_locations = local.gpu_l4_t4_location[local.region] })
      : pool
    )
  ]
}


## create public GKE standard
module "public-gke-standard-cluster" {
count = var.create_cluster && !var.private_cluster && !var.autopilot_cluster ? 1 : 0
Expand All @@ -53,13 +79,13 @@ module "public-gke-standard-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
Expand All @@ -70,7 +96,7 @@ module "public-gke-standard-cluster" {
## pools config variables
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = var.gpu_pools
gpu_pools = local.gpu_pools
enable_tpu = var.enable_tpu
tpu_pools = var.tpu_pools
all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
Expand All @@ -90,13 +116,13 @@ module "public-gke-autopilot-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
Expand All @@ -115,13 +141,13 @@ module "private-gke-standard-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
Expand All @@ -133,7 +159,7 @@ module "private-gke-standard-cluster" {
## pools config variables
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = var.gpu_pools
gpu_pools = local.gpu_pools
enable_tpu = var.enable_tpu
tpu_pools = var.tpu_pools
all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
Expand All @@ -153,13 +179,13 @@ module "private-gke-autopilot-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
Expand Down
4 changes: 2 additions & 2 deletions infrastructure/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ output "cluster_name" {
value = var.cluster_name
}

output "cluster_region" {
value = var.cluster_region
output "cluster_location" {
value = var.cluster_location
}

output "endpoint" {
Expand Down
24 changes: 11 additions & 13 deletions infrastructure/platform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -20,34 +20,32 @@ project_id = "<your project ID>"
## network values
create_network = true
network_name = "ml-network"
subnetwork_name = "ml-subnet1"
subnetwork_name = "ml-subnet"
subnetwork_cidr = "10.100.0.0/16"
subnetwork_region = "us-central1"

## gke variables
private_cluster = false ## true = private cluster, false = public cluster
autopilot_cluster = false ## true = autopilot cluster, false = standard cluster
cluster_name = "ml-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-c"]
cluster_location = "us-central1"

cpu_pools = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
name = "cpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
}]

## make sure required gpu quotas are available in that region
enable_gpu = true
gpu_pools = [{
name = "gpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-a"
node_locations = "us-central1-a" ## comment out this line to autofill node_locations based on cluster_location
autoscaling = true
min_count = 1
max_count = 3
Expand All @@ -60,7 +58,7 @@ gpu_pools = [{
{
name = "gpu-pool-l4"
machine_type = "g2-standard-24"
node_locations = "us-central1-a"
node_locations = "us-central1-a" ## comment out this line to autofill node_locations based on cluster_location
autoscaling = true
min_count = 1
max_count = 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,4 @@ subnetwork_name = "demo-subnet"
private_cluster = true ## Default true. Use false for a public cluster
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "demo-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1" ## Zonal autopilot clusters are not supported.
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,4 @@ subnetwork_region = "us-central1"
private_cluster = true ## Default true. Use false for a public cluster
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "demo-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1" ## Zonal autopilot clusters are not supported.
4 changes: 1 addition & 3 deletions infrastructure/tfvars_examples/platform.complete.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ private_cluster = true ## Default true. Use false for a pub
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "ml-cluster"
kubernetes_version = "1.28"
cluster_regional = true
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1"
ip_range_pods = "us-central1-01-gke-01-pods-1"
ip_range_services = "us-central1-01-gke-01-services-1"
monitoring_enable_managed_prometheus = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,23 @@ private_cluster = true ## Default true. Use false for a public cluster
# }]
autopilot_cluster = false # false = standard cluster, true = autopilot cluster
cluster_name = "demo-cluster-1"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1"

cpu_pools = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
name = "cpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
}]

## make sure required gpu quotas are available in that region
enable_gpu = true
gpu_pools = [{
name = "gpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
Expand Down
Loading

0 comments on commit f33cc2b

Please sign in to comment.