Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for zonal gke standard cluster #271

Merged
merged 4 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ module "infra" {

project_id = var.project_id
cluster_name = var.cluster_name
cluster_region = var.cluster_location
cluster_location = var.cluster_location
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = false
Expand Down
6 changes: 4 additions & 2 deletions applications/rag/metadata.display.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ spec:
title: Cluster Location
section: cluster_details
xGoogleProperty:
type: ET_GCE_LOCATION
type: ET_GCE_REGION
# The regions allowlisted below are those where L4 & T4 GPUs are available; see https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-tools
gce_region:
allowlisted_regions: ["asia-east1","asia-northeast1","asia-northeast3","asia-south1","asia-southeast1","europe-west1","europe-west2","europe-west3","europe-west4","us-central1","us-east1","us-east4","us-west1","us-west4"]
umeshkumhar marked this conversation as resolved.
Show resolved Hide resolved
cluster_name:
name: cluster_name
title: Cluster Name
Expand All @@ -43,7 +46,6 @@ spec:
autopilot_cluster:
name: autopilot_cluster
title: Autopilot Cluster
invisible: true
section: cluster_details
create_gcs_bucket:
name: create_gcs_bucket
Expand Down
2 changes: 1 addition & 1 deletion applications/rag/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ spec:
- name: add_auth
description: Enable iap authentication on jupyterhub
varType: bool
defaultValue: true
defaultValue: false
- name: autopilot_cluster
varType: bool
defaultValue: false
Expand Down
47 changes: 21 additions & 26 deletions applications/rag/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ variable "cpu_pools" {
type = list(object({
name = string
machine_type = string
node_locations = string
node_locations = optional(string, "")
autoscaling = optional(bool, false)
min_count = optional(number, 1)
max_count = optional(number, 3)
Expand All @@ -294,22 +294,21 @@ variable "cpu_pools" {
accelerator_count = optional(number, 0)
}))
default = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
name = "cpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
}]
}

variable "gpu_pools" {
type = list(object({
name = string
machine_type = string
node_locations = string
node_locations = optional(string, "")
autoscaling = optional(bool, false)
min_count = optional(number, 1)
max_count = optional(number, 3)
Expand All @@ -330,24 +329,21 @@ variable "gpu_pools" {
accelerator_type = optional(string, "nvidia-tesla-t4")
gpu_driver_version = optional(string, "DEFAULT")
}))
default = [
{
name = "gpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
accelerator_count = 2
accelerator_type = "nvidia-tesla-t4"
gpu_driver_version = "DEFAULT"
default = [{
name = "gpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
accelerator_count = 2
accelerator_type = "nvidia-tesla-t4"
gpu_driver_version = "DEFAULT"
},
{
name = "gpu-pool-l4"
machine_type = "g2-standard-24"
node_locations = "us-central1-a"
autoscaling = true
min_count = 1
max_count = 3
Expand All @@ -357,6 +353,5 @@ variable "gpu_pools" {
accelerator_count = 2
accelerator_type = "nvidia-l4"
gpu_driver_version = "DEFAULT"
}
]
}]
}
4 changes: 2 additions & 2 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ steps:
-var-file=tfvars_tests/standard-gke-public.platform.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_region=$_REGION \
-var=cluster_location=$_REGION \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/gke_cluster_result.txt
dir: 'infrastructure/'
Expand Down Expand Up @@ -300,7 +300,7 @@ steps:
cd /workspace/infrastructure
terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_region=$_REGION -auto-approve -no-color
-var=cluster_location=$_REGION -auto-approve -no-color

allowFailure: true
waitFor: ['cleanup rag']
Expand Down
54 changes: 40 additions & 14 deletions infrastructure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,26 @@
#### PLATFORM
#######################################################

## GPU locations where L4 & T4 are supported.
## Maps a region name to a comma-separated string of zones in that region.
## The `gpu_pools` local looks this map up by region to fill in `node_locations`
## for GPU pools that leave it empty on a regional cluster.
## NOTE(review): the zone lists are hard-coded; verify them against the GCP
## GPU regions/zones page before adding or changing an entry.
locals {
gpu_l4_t4_location = {
asia-east1 = "asia-east1-a,asia-east1-c"
asia-northeast1 = "asia-northeast1-a,asia-northeast1-c"
asia-northeast3 = "asia-northeast3-b"
asia-south1 = "asia-south1-a,asia-south1-b"
asia-southeast1 = "asia-southeast1-a,asia-southeast1-b,asia-southeast1-c"
europe-west1 = "europe-west1-b,europe-west1-c"
europe-west2 = "europe-west2-a,europe-west2-b"
europe-west3 = "europe-west3-b"
europe-west4 = "europe-west4-a,europe-west4-b,europe-west4-c"
us-central1 = "us-central1-a,us-central1-b,us-central1-c"
us-east1 = "us-east1-c,us-east1-d"
us-east4 = "us-east4-a,us-east4-c"
us-west1 = "us-west1-a,us-west1-b"
us-west4 = "us-west4-a"
}
}

module "custom-network" {
source = "terraform-google-modules/network/google"
version = "8.0.0"
Expand All @@ -40,8 +60,14 @@ module "custom-network" {
locals {
  # Use the names produced by the custom-network module when this configuration
  # creates the network; otherwise fall back to the caller-supplied names.
  network_name    = var.create_network ? module.custom-network[0].network_name : var.network_name
  subnetwork_name = var.create_network ? module.custom-network[0].subnets_names[0] : var.subnetwork_name

  # A cluster_location with exactly two dash-separated parts is treated as a
  # region; one with more parts is treated as a comma-separated list of zones.
  region   = length(split("-", var.cluster_location)) == 2 ? var.cluster_location : ""
  regional = local.region != ""
  zone     = length(split("-", var.cluster_location)) > 2 ? split(",", var.cluster_location) : []

  # Fill in node_locations for GPU pools that did not provide it, based on
  # region and zone GPU availability. Only applies to regional clusters whose
  # region has an entry in gpu_l4_t4_location; all other pools pass through
  # unchanged.
  gpu_pools = [
    for pool in var.gpu_pools : (
      (local.regional && contains(keys(local.gpu_l4_t4_location), local.region) && pool.node_locations == "")
      ? merge(pool, { node_locations = local.gpu_l4_t4_location[local.region] })
      : pool
    )
  ]
}


## create public GKE standard
module "public-gke-standard-cluster" {
count = var.create_cluster && !var.private_cluster && !var.autopilot_cluster ? 1 : 0
Expand All @@ -53,13 +79,13 @@ module "public-gke-standard-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
imreddy13 marked this conversation as resolved.
Show resolved Hide resolved
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
Expand All @@ -70,7 +96,7 @@ module "public-gke-standard-cluster" {
## pools config variables
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = var.gpu_pools
gpu_pools = local.gpu_pools
enable_tpu = var.enable_tpu
tpu_pools = var.tpu_pools
all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
Expand All @@ -90,13 +116,13 @@ module "public-gke-autopilot-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
Expand All @@ -115,13 +141,13 @@ module "private-gke-standard-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
Expand All @@ -133,7 +159,7 @@ module "private-gke-standard-cluster" {
## pools config variables
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = var.gpu_pools
gpu_pools = local.gpu_pools
enable_tpu = var.enable_tpu
tpu_pools = var.tpu_pools
all_node_pools_oauth_scopes = var.all_node_pools_oauth_scopes
Expand All @@ -153,13 +179,13 @@ module "private-gke-autopilot-cluster" {
subnetwork_name = local.subnetwork_name

## gke variables
cluster_regional = var.cluster_regional
cluster_regional = local.regional
cluster_region = local.region
cluster_zones = local.zone
cluster_name = var.cluster_name
cluster_labels = var.cluster_labels
kubernetes_version = var.kubernetes_version
release_channel = var.release_channel
cluster_region = var.cluster_region
cluster_zones = var.cluster_zones
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
Expand Down
4 changes: 2 additions & 2 deletions infrastructure/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ output "cluster_name" {
value = var.cluster_name
}

output "cluster_region" {
value = var.cluster_region
output "cluster_location" {
value = var.cluster_location
}

output "endpoint" {
Expand Down
24 changes: 11 additions & 13 deletions infrastructure/platform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -20,34 +20,32 @@ project_id = "<your project ID>"
## network values
create_network = true
network_name = "ml-network"
subnetwork_name = "ml-subnet1"
subnetwork_name = "ml-subnet"
subnetwork_cidr = "10.100.0.0/16"
subnetwork_region = "us-central1"

## gke variables
private_cluster = false ## true = private cluster, false = public cluster
autopilot_cluster = false ## true = autopilot cluster, false = standard cluster
cluster_name = "ml-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-c"]
cluster_location = "us-central1"

cpu_pools = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
name = "cpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
}]

## make sure required gpu quotas are available in that region
enable_gpu = true
gpu_pools = [{
name = "gpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-a"
node_locations = "us-central1-a" ## comment out this line to autofill node_locations based on cluster_location
autoscaling = true
min_count = 1
max_count = 3
Expand All @@ -60,7 +58,7 @@ gpu_pools = [{
{
name = "gpu-pool-l4"
machine_type = "g2-standard-24"
node_locations = "us-central1-a"
node_locations = "us-central1-a" ## comment out this line to autofill node_locations based on cluster_location
autoscaling = true
min_count = 1
max_count = 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,4 @@ subnetwork_name = "demo-subnet"
private_cluster = true ## Default true. Use false for a public cluster
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "demo-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1" ## Zonal autopilot clusters are not supported.
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,4 @@ subnetwork_region = "us-central1"
private_cluster = true ## Default true. Use false for a public cluster
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "demo-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1" ## Zonal autopilot clusters are not supported.
4 changes: 1 addition & 3 deletions infrastructure/tfvars_examples/platform.complete.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ private_cluster = true ## Default true. Use false for a pub
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "ml-cluster"
kubernetes_version = "1.28"
cluster_regional = true
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1"
ip_range_pods = "us-central1-01-gke-01-pods-1"
ip_range_services = "us-central1-01-gke-01-services-1"
monitoring_enable_managed_prometheus = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,23 @@ private_cluster = true ## Default true. Use false for a public cluster
# }]
autopilot_cluster = false # false = standard cluster, true = autopilot cluster
cluster_name = "demo-cluster-1"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
cluster_location = "us-central1"

cpu_pools = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
name = "cpu-pool"
machine_type = "n1-standard-16"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
disk_type = "pd-standard"
}]

## make sure required gpu quotas are available in that region
enable_gpu = true
gpu_pools = [{
name = "gpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
Expand Down
Loading
Loading