Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: use ray addon when creating GKE clusters #781

Merged
merged 2 commits into from
Aug 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 20 additions & 36 deletions applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 +64,24 @@ module "infra" {
source = "../../infrastructure"
count = var.create_cluster ? 1 : 0

project_id = var.project_id
cluster_name = local.cluster_name
cluster_location = var.cluster_location
region = local.cluster_location_region
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = var.create_network
network_name = local.network_name
subnetwork_name = local.network_name
subnetwork_cidr = var.subnetwork_cidr
subnetwork_region = local.cluster_location_region
cpu_pools = var.cpu_pools
enable_gpu = true
gpu_pools = var.gpu_pools
kubernetes_version = var.kubernetes_version
project_id = var.project_id
cluster_name = local.cluster_name
cluster_location = var.cluster_location
region = local.cluster_location_region
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = var.create_network
network_name = local.network_name
subnetwork_name = local.network_name
subnetwork_cidr = var.subnetwork_cidr
subnetwork_region = local.cluster_location_region
cpu_pools = var.cpu_pools
enable_gpu = true
gpu_pools = var.gpu_pools
ray_addon_enabled = true
# TODO(genlu): remove the release_channel and kubernetes_version pins below once the Ray addon is available in the REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"
depends_on = [module.project-services]
}

Expand Down Expand Up @@ -152,16 +155,6 @@ module "namespace" {
namespace = local.kubernetes_namespace
}

# Instantiates the local kuberay-operator module for the RAG application.
# It uses the "rag"-aliased helm and kubernetes providers and targets the
# application's namespace (local.kubernetes_namespace).
module "kuberay-operator" {
source = "../../modules/kuberay-operator"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
name = "kuberay-operator"
project_id = var.project_id
# The module is asked to create the namespace itself.
create_namespace = true
namespace = local.kubernetes_namespace
# Tells the module whether the target cluster is an Autopilot cluster.
autopilot_cluster = local.enable_autopilot
}

module "gcs" {
source = "../../modules/gcs"
count = var.create_gcs_bucket ? 1 : 0
Expand Down Expand Up @@ -216,13 +209,6 @@ module "jupyterhub" {
depends_on = [module.namespace, module.gcs]
}

# Instantiates the local kuberay-logging module in the application's namespace
# using the "rag"-aliased kubernetes provider.
module "kuberay-logging" {
source = "../../modules/kuberay-logging"
providers = { kubernetes = kubernetes.rag }
namespace = local.kubernetes_namespace
# Applied only after module.namespace, which this module depends on.
depends_on = [module.namespace]
}

module "kuberay-workload-identity" {
providers = { kubernetes = kubernetes.rag }
source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
Expand All @@ -245,8 +231,7 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.ray_service_account
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.namespace, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.namespace, module.kuberay-workload-identity]
}

module "kuberay-cluster" {
Expand Down Expand Up @@ -281,8 +266,7 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.gcs, module.kuberay-workload-identity]
}

module "inference-server" {
Expand Down
2 changes: 1 addition & 1 deletion applications/rag/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ variable "cluster_location" {

variable "kubernetes_version" {
type = string
default = "1.28"
default = "1.30"
}

variable "kubernetes_namespace" {
Expand Down
30 changes: 7 additions & 23 deletions applications/ray/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ module "infra" {
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = var.gpu_pools
depends_on = [module.project-services]
ray_addon_enabled = true
# TODO(genlu): remove the release_channel and kubernetes_version pins below once the Ray addon is available in the REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"
depends_on = [module.project-services]
}

data "google_container_cluster" "default" {
Expand Down Expand Up @@ -147,24 +151,6 @@ module "kuberay-workload-identity" {
depends_on = [module.namespace]
}

module "kuberay-operator" {
source = "../../modules/kuberay-operator"
genlu2011 marked this conversation as resolved.
Show resolved Hide resolved
providers = { helm = helm.ray, kubernetes = kubernetes.ray }
name = "kuberay-operator"
create_namespace = true
namespace = local.kubernetes_namespace
project_id = var.project_id
autopilot_cluster = local.enable_autopilot
}

module "kuberay-logging" {
source = "../../modules/kuberay-logging"
genlu2011 marked this conversation as resolved.
Show resolved Hide resolved
providers = { kubernetes = kubernetes.ray }
namespace = local.kubernetes_namespace

depends_on = [module.namespace]
}

module "kuberay-monitoring" {
genlu2011 marked this conversation as resolved.
Show resolved Hide resolved
count = var.create_ray_cluster ? 1 : 0
source = "../../modules/kuberay-monitoring"
Expand All @@ -175,8 +161,7 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.workload_identity_service_account
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.kuberay-workload-identity, module.kuberay-operator]
depends_on = [module.kuberay-workload-identity]
}

module "gcs" {
Expand Down Expand Up @@ -216,8 +201,7 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.gcs, module.kuberay-workload-identity]
}


Expand Down
8 changes: 5 additions & 3 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,12 @@ steps:
-auto-approve -no-color
echo "pass" > /workspace/user_result.txt

# Make sure pods are running
chmod +x /workspace/scripts/ci/wait_for_pods.sh
/workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000

kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s
# Wait for pods to be stable
sleep 5s
# Ray head's readinessProbe does not currently probe the head service, so the readiness wait above is not reliable on its own.
sleep 60s
kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 &
# Wait for port-forwarding to be established
sleep 10s
Expand Down
8 changes: 6 additions & 2 deletions infrastructure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ module "public-gke-standard-cluster" {
all_node_pools_labels = var.all_node_pools_labels
all_node_pools_metadata = var.all_node_pools_metadata
all_node_pools_tags = var.all_node_pools_tags
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]
}

Expand All @@ -141,8 +142,8 @@ module "public-gke-autopilot-cluster" {
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]

}

## create private GKE standard
Expand Down Expand Up @@ -170,6 +171,7 @@ module "private-gke-standard-cluster" {
deletion_protection = var.deletion_protection
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
ray_addon_enabled = var.ray_addon_enabled

## pools config variables
cpu_pools = var.cpu_pools
Expand Down Expand Up @@ -207,7 +209,9 @@ module "private-gke-autopilot-cluster" {
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection
depends_on = [module.custom-network]
ray_addon_enabled = var.ray_addon_enabled

depends_on = [module.custom-network]
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ autopilot_cluster = false # false = standard cluster, true = autopilot cluster
cluster_name = "test-cluster"
cluster_location = "us-east4"
gcs_fuse_csi_driver = true
ray_addon_enabled = true
# TODO(genlu): remove release_channel and kubernetes_version after 1.30.3-gke.1969000 is in REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"

cpu_pools = [{
name = "cpu-pool"
Expand Down
10 changes: 9 additions & 1 deletion infrastructure/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ variable "cluster_labels" {

variable "kubernetes_version" {
type = string
default = "1.28"
default = "1.30"
}

variable "release_channel" {
Expand Down Expand Up @@ -127,6 +127,13 @@ variable "deletion_protection" {
type = bool
default = false
}

# Toggle for the Ray addon on the GKE cluster created by this module.
# The value is passed unchanged to each of the four cluster modules
# (public/private x standard/autopilot) as their ray_addon_enabled input.
# Defaults to true, so the addon is enabled unless a caller opts out.
variable "ray_addon_enabled" {
type = bool
description = "Set to true to enable ray addon"
default = true
}

variable "master_authorized_networks" {
type = list(object({
cidr_block = string
Expand Down Expand Up @@ -173,6 +180,7 @@ variable "enable_tpu" {
description = "Set to true to create TPU node pool"
default = false
}

variable "enable_gpu" {
type = bool
description = "Set to true to create GPU node pool"
Expand Down
8 changes: 6 additions & 2 deletions modules/gke-autopilot-private-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-private-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -35,7 +35,11 @@ module "gke" {
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}
}

# GKE cluster fleet registration
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-autopilot-private-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,9 @@ variable "master_ipv4_cidr_block" {
type = string
default = ""
}

# Toggle for the Ray operator addon on the private Autopilot cluster.
# This single flag feeds all three fields of the gke module's
# ray_operator_config (enabled, logging_enabled, monitoring_enabled).
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring. Enabled by default."
  type        = bool
  default     = true
}
8 changes: 7 additions & 1 deletion modules/gke-autopilot-public-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-public-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -29,4 +29,10 @@ module "gke" {
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}
}
6 changes: 6 additions & 0 deletions modules/gke-autopilot-public-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,9 @@ variable "deletion_protection" {
type = bool
default = false
}

# Toggle for the Ray operator addon on the public Autopilot cluster.
# This single flag feeds all three fields of the gke module's
# ray_operator_config (enabled, logging_enabled, monitoring_enabled).
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring. Enabled by default."
  type        = bool
  default     = true
}
8 changes: 7 additions & 1 deletion modules/gke-standard-private-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ locals {

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/private-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -45,6 +45,12 @@ module "gke" {
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}

node_pools = local.node_pools

node_pools_oauth_scopes = {
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-standard-private-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,9 @@ variable "datapath_provider" {
type = string
default = "ADVANCED_DATAPATH"
}

# Toggle for the Ray operator addon on the private Standard cluster.
# This single flag feeds all three fields of the gke module's
# ray_operator_config (enabled, logging_enabled, monitoring_enabled).
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring. Enabled by default."
  type        = bool
  default     = true
}
8 changes: 7 additions & 1 deletion modules/gke-standard-public-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ locals {

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -40,6 +40,12 @@ module "gke" {
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
master_authorized_networks = var.master_authorized_networks

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}

node_pools = local.node_pools

node_pools_oauth_scopes = {
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-standard-public-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,9 @@ variable "datapath_provider" {
type = string
default = "ADVANCED_DATAPATH"
}

# Toggle for the Ray operator addon on the public Standard cluster.
# This single flag feeds all three fields of the gke module's
# ray_operator_config (enabled, logging_enabled, monitoring_enabled).
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring. Enabled by default."
  type        = bool
  default     = true
}
Loading
Loading