Skip to content

Commit

Permalink
fix: use ray addon when creating GKE clusters
Browse files Browse the repository at this point in the history
Change-Id: Iac49ae3e2d57701754a50cbfc10b9bba70829f41
  • Loading branch information
Gen Lu committed Aug 29, 2024
1 parent b891db1 commit d15b816
Show file tree
Hide file tree
Showing 15 changed files with 82 additions and 120 deletions.
30 changes: 7 additions & 23 deletions applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,11 @@ module "infra" {
cpu_pools = var.cpu_pools
enable_gpu = true
gpu_pools = var.gpu_pools
kubernetes_version = var.kubernetes_version
depends_on = [module.project-services]
ray_addon_enabled = true
# TODO(genlu): remove the pinned release_channel and kubernetes_version once the ray addon is available in the REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"
depends_on = [module.project-services]
}

data "google_container_cluster" "default" {
Expand Down Expand Up @@ -152,16 +155,6 @@ module "namespace" {
namespace = local.kubernetes_namespace
}

# Instantiates the local kuberay-operator module (../../modules/kuberay-operator)
# using the helm.rag / kubernetes.rag provider aliases, targeting
# local.kubernetes_namespace (create_namespace = true).
# NOTE(review): this page is a rendered diff without +/- markers. The header for
# this file reports 23 deletions and the depends_on lists further down no longer
# name module.kuberay-operator, so this block appears to be the removed side of
# the change — confirm against the raw diff.
module "kuberay-operator" {
source = "../../modules/kuberay-operator"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
name = "kuberay-operator"
project_id = var.project_id
create_namespace = true
namespace = local.kubernetes_namespace
autopilot_cluster = local.enable_autopilot
}

module "gcs" {
source = "../../modules/gcs"
count = var.create_gcs_bucket ? 1 : 0
Expand Down Expand Up @@ -216,13 +209,6 @@ module "jupyterhub" {
depends_on = [module.namespace, module.gcs]
}

# Instantiates the local kuberay-logging module (../../modules/kuberay-logging)
# in local.kubernetes_namespace with the kubernetes.rag provider alias; applied
# only after module.namespace.
# NOTE(review): this page is a rendered diff without +/- markers; this block
# appears to be among the 23 deletions reported for this file — confirm against
# the raw diff.
module "kuberay-logging" {
source = "../../modules/kuberay-logging"
providers = { kubernetes = kubernetes.rag }
namespace = local.kubernetes_namespace
depends_on = [module.namespace]
}

module "kuberay-workload-identity" {
providers = { kubernetes = kubernetes.rag }
source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
Expand All @@ -245,8 +231,7 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.ray_service_account
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.namespace, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.namespace, module.kuberay-workload-identity]
}

module "kuberay-cluster" {
Expand Down Expand Up @@ -281,8 +266,7 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.gcs, module.kuberay-workload-identity]
}

module "inference-server" {
Expand Down
2 changes: 1 addition & 1 deletion applications/rag/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ variable "cluster_location" {

# Kubernetes version input for the rag application.
# NOTE(review): the two `default` lines are the old ("1.28") and new ("1.30")
# sides of a rendered diff (the file header reports 1 addition & 1 deletion);
# they are not two attributes of one block.
# NOTE(review): in this commit's main.tf hunk, module "infra" sets
# kubernetes_version to the literal "1.30.3-gke.1969000" where it previously
# passed var.kubernetes_version, so this default may no longer reach the
# cluster — confirm whether the variable is still consumed elsewhere.
variable "kubernetes_version" {
type = string
default = "1.28"
default = "1.30"
}

variable "kubernetes_namespace" {
Expand Down
30 changes: 7 additions & 23 deletions applications/ray/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ module "infra" {
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = var.gpu_pools
depends_on = [module.project-services]
ray_addon_enabled = true
# TODO(genlu): remove the pinned release_channel and kubernetes_version once the ray addon is available in the REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"
depends_on = [module.project-services]
}

data "google_container_cluster" "default" {
Expand Down Expand Up @@ -147,24 +151,6 @@ module "kuberay-workload-identity" {
depends_on = [module.namespace]
}

# Instantiates the local kuberay-operator module (../../modules/kuberay-operator)
# using the helm.ray / kubernetes.ray provider aliases, targeting
# local.kubernetes_namespace (create_namespace = true).
# NOTE(review): this page is a rendered diff without +/- markers. The header for
# this file reports 23 deletions and the depends_on lists further down no longer
# name module.kuberay-operator, so this block appears to be the removed side of
# the change — confirm against the raw diff.
module "kuberay-operator" {
source = "../../modules/kuberay-operator"
providers = { helm = helm.ray, kubernetes = kubernetes.ray }
name = "kuberay-operator"
create_namespace = true
namespace = local.kubernetes_namespace
project_id = var.project_id
autopilot_cluster = local.enable_autopilot
}

# Instantiates the local kuberay-logging module (../../modules/kuberay-logging)
# in local.kubernetes_namespace with the kubernetes.ray provider alias; applied
# only after module.namespace.
# NOTE(review): this page is a rendered diff without +/- markers; this block
# appears to be among the 23 deletions reported for this file — confirm against
# the raw diff.
module "kuberay-logging" {
source = "../../modules/kuberay-logging"
providers = { kubernetes = kubernetes.ray }
namespace = local.kubernetes_namespace

depends_on = [module.namespace]
}

module "kuberay-monitoring" {
count = var.create_ray_cluster ? 1 : 0
source = "../../modules/kuberay-monitoring"
Expand All @@ -175,8 +161,7 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.workload_identity_service_account
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.kuberay-workload-identity, module.kuberay-operator]
depends_on = [module.kuberay-workload-identity]
}

module "gcs" {
Expand Down Expand Up @@ -216,8 +201,7 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.gcs, module.kuberay-workload-identity]
}


Expand Down
8 changes: 6 additions & 2 deletions infrastructure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ module "public-gke-standard-cluster" {
all_node_pools_labels = var.all_node_pools_labels
all_node_pools_metadata = var.all_node_pools_metadata
all_node_pools_tags = var.all_node_pools_tags
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]
}

Expand All @@ -141,8 +142,8 @@ module "public-gke-autopilot-cluster" {
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]

}

## create private GKE standard
Expand Down Expand Up @@ -170,6 +171,7 @@ module "private-gke-standard-cluster" {
deletion_protection = var.deletion_protection
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
ray_addon_enabled = var.ray_addon_enabled

## pools config variables
cpu_pools = var.cpu_pools
Expand Down Expand Up @@ -207,7 +209,9 @@ module "private-gke-autopilot-cluster" {
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection
depends_on = [module.custom-network]
ray_addon_enabled = var.ray_addon_enabled

depends_on = [module.custom-network]
}


Expand Down
10 changes: 9 additions & 1 deletion infrastructure/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ variable "cluster_labels" {

# Kubernetes version input for the shared infrastructure module.
# NOTE(review): the two `default` lines are the old ("1.28") and new ("1.30")
# sides of a rendered diff (the only deletion reported for this file); they are
# not two attributes of one block.
variable "kubernetes_version" {
type = string
default = "1.28"
default = "1.30"
}

variable "release_channel" {
Expand Down Expand Up @@ -127,6 +127,13 @@ variable "deletion_protection" {
type = bool
default = false
}

# Boolean switch for the Ray addon. infrastructure/main.tf forwards this value
# unchanged as `ray_addon_enabled` to each of the four GKE cluster modules
# (public/private x standard/autopilot).
variable "ray_addon_enabled" {
  description = "Set to true to enable ray addon"
  default     = true
  type        = bool
}

variable "master_authorized_networks" {
type = list(object({
cidr_block = string
Expand Down Expand Up @@ -173,6 +180,7 @@ variable "enable_tpu" {
description = "Set to true to create TPU node pool"
default = false
}

variable "enable_gpu" {
type = bool
description = "Set to true to create GPU node pool"
Expand Down
8 changes: 6 additions & 2 deletions modules/gke-autopilot-private-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-private-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -35,7 +35,11 @@ module "gke" {
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}
}

# GKE cluster fleet registration
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-autopilot-private-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,9 @@ variable "master_ipv4_cidr_block" {
type = string
default = ""
}

# Controls the Ray operator addon for the private Autopilot cluster module.
# This module's main.tf feeds this single flag into all three
# ray_operator_config fields (enabled, logging_enabled, monitoring_enabled),
# so the addon and its logging/monitoring are switched on or off together.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  default     = true
}
8 changes: 7 additions & 1 deletion modules/gke-autopilot-public-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-public-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -29,4 +29,10 @@ module "gke" {
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}
}
6 changes: 6 additions & 0 deletions modules/gke-autopilot-public-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,9 @@ variable "deletion_protection" {
type = bool
default = false
}

# Controls the Ray operator addon for the public Autopilot cluster module.
# This module's main.tf feeds this single flag into all three
# ray_operator_config fields (enabled, logging_enabled, monitoring_enabled),
# so the addon and its logging/monitoring are switched on or off together.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  default     = true
}
8 changes: 7 additions & 1 deletion modules/gke-standard-private-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ locals {

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/private-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -45,6 +45,12 @@ module "gke" {
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}

node_pools = local.node_pools

node_pools_oauth_scopes = {
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-standard-private-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,9 @@ variable "datapath_provider" {
type = string
default = "ADVANCED_DATAPATH"
}

# Controls the Ray operator addon for the private Standard cluster module.
# This module's main.tf feeds this single flag into all three
# ray_operator_config fields (enabled, logging_enabled, monitoring_enabled),
# so the addon and its logging/monitoring are switched on or off together.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  default     = true
}
8 changes: 7 additions & 1 deletion modules/gke-standard-public-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ locals {

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -40,6 +40,12 @@ module "gke" {
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
master_authorized_networks = var.master_authorized_networks

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}

node_pools = local.node_pools

node_pools_oauth_scopes = {
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-standard-public-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,9 @@ variable "datapath_provider" {
type = string
default = "ADVANCED_DATAPATH"
}

# Controls the Ray operator addon for the public Standard cluster module.
# This module's main.tf feeds this single flag into all three
# ray_operator_config fields (enabled, logging_enabled, monitoring_enabled),
# so the addon and its logging/monitoring are switched on or off together.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  default     = true
}
Loading

0 comments on commit d15b816

Please sign in to comment.