diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 230b6c3e2..629e607aa 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -78,8 +78,11 @@ module "infra" { cpu_pools = var.cpu_pools enable_gpu = true gpu_pools = var.gpu_pools - kubernetes_version = var.kubernetes_version - depends_on = [module.project-services] + ray_addon_enabled = true + # TODO(genlu): remove channel and k8s_version after ray addon is in REGULAR channel + release_channel = "RAPID" + kubernetes_version = "1.30.3-gke.1969000" + depends_on = [module.project-services] } data "google_container_cluster" "default" { @@ -152,16 +155,6 @@ module "namespace" { namespace = local.kubernetes_namespace } -module "kuberay-operator" { - source = "../../modules/kuberay-operator" - providers = { helm = helm.rag, kubernetes = kubernetes.rag } - name = "kuberay-operator" - project_id = var.project_id - create_namespace = true - namespace = local.kubernetes_namespace - autopilot_cluster = local.enable_autopilot -} - module "gcs" { source = "../../modules/gcs" count = var.create_gcs_bucket ? 1 : 0 @@ -216,13 +209,6 @@ module "jupyterhub" { depends_on = [module.namespace, module.gcs] } -module "kuberay-logging" { - source = "../../modules/kuberay-logging" - providers = { kubernetes = kubernetes.rag } - namespace = local.kubernetes_namespace - depends_on = [module.namespace] -} - module "kuberay-workload-identity" { providers = { kubernetes = kubernetes.rag } source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" @@ -245,8 +231,7 @@ module "kuberay-monitoring" { create_namespace = true enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard k8s_service_account = local.ray_service_account - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. - depends_on = [module.namespace, module.kuberay-operator, module.kuberay-workload-identity] + depends_on = [module.namespace, module.kuberay-workload-identity] } module "kuberay-cluster" { @@ -281,8 +266,7 @@ module "kuberay-cluster" { k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port domain = var.ray_dashboard_domain members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : [] - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. - depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity] + depends_on = [module.gcs, module.kuberay-workload-identity] } module "inference-server" { diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf index c80f39791..b8cef7bd5 100644 --- a/applications/rag/variables.tf +++ b/applications/rag/variables.tf @@ -33,7 +33,7 @@ variable "cluster_location" { variable "kubernetes_version" { type = string - default = "1.28" + default = "1.30" } variable "kubernetes_namespace" { diff --git a/applications/ray/main.tf b/applications/ray/main.tf index 8f4a5ecef..585490caf 100644 --- a/applications/ray/main.tf +++ b/applications/ray/main.tf @@ -73,7 +73,11 @@ module "infra" { cpu_pools = var.cpu_pools enable_gpu = var.enable_gpu gpu_pools = var.gpu_pools - depends_on = [module.project-services] + ray_addon_enabled = true + # TODO(genlu): remove channel and k8s_version after ray addon is in REGULAR channel + release_channel = "RAPID" + kubernetes_version = "1.30.3-gke.1969000" + depends_on = [module.project-services] } data "google_container_cluster" "default" { @@ -147,24 +151,6 @@ module "kuberay-workload-identity" { depends_on = [module.namespace] } -module "kuberay-operator" { - source = "../../modules/kuberay-operator" - providers = { helm = helm.ray, kubernetes = kubernetes.ray } - name = "kuberay-operator" - create_namespace = true - namespace = local.kubernetes_namespace - project_id = var.project_id - autopilot_cluster = local.enable_autopilot -} - -module "kuberay-logging" { - source = "../../modules/kuberay-logging" - providers = { kubernetes = kubernetes.ray } - namespace = local.kubernetes_namespace - - depends_on = [module.namespace] -} - module "kuberay-monitoring" { count = var.create_ray_cluster ? 1 : 0 source = "../../modules/kuberay-monitoring" @@ -175,8 +161,7 @@ module "kuberay-monitoring" { create_namespace = true enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard k8s_service_account = local.workload_identity_service_account - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. - depends_on = [module.kuberay-workload-identity, module.kuberay-operator] + depends_on = [module.kuberay-workload-identity] } module "gcs" { @@ -216,8 +201,7 @@ module "kuberay-cluster" { k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port domain = var.ray_dashboard_domain members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : [] - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. - depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity] + depends_on = [module.gcs, module.kuberay-workload-identity] } diff --git a/infrastructure/main.tf b/infrastructure/main.tf index 64000e87b..1b44cea8c 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -116,6 +116,7 @@ module "public-gke-standard-cluster" { all_node_pools_labels = var.all_node_pools_labels all_node_pools_metadata = var.all_node_pools_metadata all_node_pools_tags = var.all_node_pools_tags + ray_addon_enabled = var.ray_addon_enabled depends_on = [module.custom-network] } @@ -141,8 +142,8 @@ module "public-gke-autopilot-cluster" { ip_range_services = var.ip_range_services master_authorized_networks = var.master_authorized_networks deletion_protection = var.deletion_protection + ray_addon_enabled = var.ray_addon_enabled depends_on = [module.custom-network] - } ## create private GKE standard @@ -170,6 +171,7 @@ module "private-gke-standard-cluster" { deletion_protection = var.deletion_protection master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block + ray_addon_enabled = var.ray_addon_enabled ## pools config variables cpu_pools = var.cpu_pools @@ -207,7 +209,9 @@ module "private-gke-autopilot-cluster" { master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block deletion_protection = var.deletion_protection - depends_on = [module.custom-network] + ray_addon_enabled = var.ray_addon_enabled + + depends_on = [module.custom-network] } diff --git a/infrastructure/variables.tf b/infrastructure/variables.tf index 98593e609..dafc23fda 100644 --- a/infrastructure/variables.tf +++ b/infrastructure/variables.tf @@ -95,7 +95,7 @@ variable "cluster_labels" { variable "kubernetes_version" { type = string - default = "1.28" + default = "1.30" } variable "release_channel" { @@ -127,6 +127,13 @@ variable "deletion_protection" { type = bool default = false } + +variable "ray_addon_enabled" { + type = bool + description = "Set to true to enable ray addon" + default = true +} + variable "master_authorized_networks" { type = list(object({ cidr_block = string @@ -173,6 +180,7 @@ variable "enable_tpu" { description = "Set to true to create TPU node pool" default = false } + variable "enable_gpu" { type = bool description = "Set to true to create GPU node pool" diff --git a/modules/gke-autopilot-private-cluster/main.tf b/modules/gke-autopilot-private-cluster/main.tf index 8a0e9284b..b7ceb277a 100644 --- a/modules/gke-autopilot-private-cluster/main.tf +++ b/modules/gke-autopilot-private-cluster/main.tf @@ -14,7 +14,7 @@ module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-private-cluster" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -35,7 +35,11 @@ module "gke" { master_authorized_networks = var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block deletion_protection = var.deletion_protection - + ray_operator_config = { + enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } } # GKE cluster fleet registration diff --git a/modules/gke-autopilot-private-cluster/variables.tf b/modules/gke-autopilot-private-cluster/variables.tf index 4009d186c..19dcb9e9f 100644 --- a/modules/gke-autopilot-private-cluster/variables.tf +++ b/modules/gke-autopilot-private-cluster/variables.tf @@ -84,3 +84,9 @@ variable "master_ipv4_cidr_block" { type = string default = "" } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/gke-autopilot-public-cluster/main.tf b/modules/gke-autopilot-public-cluster/main.tf index d86092c09..6f52c3ff7 100644 --- a/modules/gke-autopilot-public-cluster/main.tf +++ b/modules/gke-autopilot-public-cluster/main.tf @@ -14,7 +14,7 @@ module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-public-cluster" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -29,4 +29,10 @@ module "gke" { ip_range_services = var.ip_range_services master_authorized_networks = var.master_authorized_networks deletion_protection = var.deletion_protection + + ray_operator_config = { + enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } } diff --git a/modules/gke-autopilot-public-cluster/variables.tf b/modules/gke-autopilot-public-cluster/variables.tf index 549086888..1e9df15c0 100644 --- a/modules/gke-autopilot-public-cluster/variables.tf +++ b/modules/gke-autopilot-public-cluster/variables.tf @@ -79,3 +79,9 @@ variable "deletion_protection" { type = bool default = false } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/gke-standard-private-cluster/main.tf b/modules/gke-standard-private-cluster/main.tf index e76a386cb..dbb58fe70 100644 --- a/modules/gke-standard-private-cluster/main.tf +++ b/modules/gke-standard-private-cluster/main.tf @@ -18,7 +18,7 @@ locals { module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/private-cluster" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -45,6 +45,12 @@ module "gke" { master_authorized_networks = var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block + ray_operator_config = { + enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } + node_pools = local.node_pools node_pools_oauth_scopes = { diff --git a/modules/gke-standard-private-cluster/variables.tf b/modules/gke-standard-private-cluster/variables.tf index e4ad14916..bab97d5f9 100644 --- a/modules/gke-standard-private-cluster/variables.tf +++ b/modules/gke-standard-private-cluster/variables.tf @@ -133,3 +133,9 @@ variable "datapath_provider" { type = string default = "ADVANCED_DATAPATH" } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/gke-standard-public-cluster/main.tf b/modules/gke-standard-public-cluster/main.tf index fb5d49fb8..677d65f49 100644 --- a/modules/gke-standard-public-cluster/main.tf +++ b/modules/gke-standard-public-cluster/main.tf @@ -18,7 +18,7 @@ locals { module "gke" { source = "terraform-google-modules/kubernetes-engine/google" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -40,6 +40,12 @@ module "gke" { monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus master_authorized_networks = var.master_authorized_networks + ray_operator_config = { + enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } + node_pools = local.node_pools node_pools_oauth_scopes = { diff --git a/modules/gke-standard-public-cluster/variables.tf b/modules/gke-standard-public-cluster/variables.tf index 1fd865a19..9e2f242b7 100644 --- a/modules/gke-standard-public-cluster/variables.tf +++ b/modules/gke-standard-public-cluster/variables.tf @@ -128,3 +128,9 @@ variable "datapath_provider" { type = string default = "ADVANCED_DATAPATH" } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/kuberay-cluster/values.yaml b/modules/kuberay-cluster/values.yaml index a1028fd0a..d6e5487c0 100644 --- a/modules/kuberay-cluster/values.yaml +++ b/modules/kuberay-cluster/values.yaml @@ -114,11 +114,6 @@ head: securityContext: ${indent(4, chomp(yamlencode(security_context)))} volumes: - - name: ray-logs - emptyDir: {} - - name: fluentbit-config - configMap: - name: fluentbit-config - name: gcs-fuse-csi-ephemeral csi: driver: gcsfuse.csi.storage.gke.io @@ -132,33 +127,11 @@ head: optional: true # Ray writes logs to /tmp/ray/session_latests/logs volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - name: gcs-fuse-csi-ephemeral mountPath: /data - name: secret-volume mountPath: /etc/secret-volume readOnly: true - # sidecarContainers specifies additional containers to attach to the Ray pod. - # Follows standard K8s container spec. - sidecarContainers: - - name: fluentbit - image: fluent/fluent-bit:1.9.6 - # These resource requests for Fluent Bit should be sufficient in production. - resources: - requests: - cpu: 100m - memory: 128Mi - ephemeral-storage: 2Gi - limits: - cpu: 100m - memory: 128Mi - ephemeral-storage: 2Gi - volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - - mountPath: /fluent-bit/etc/ - name: fluentbit-config worker: # If you want to disable the default workergroup @@ -211,11 +184,6 @@ worker: securityContext: ${indent(4, chomp(yamlencode(security_context)))} volumes: - - name: ray-logs - emptyDir: {} - - name: fluentbit-config - configMap: - name: fluentbit-config - name: gcs-fuse-csi-ephemeral csi: driver: gcsfuse.csi.storage.gke.io @@ -229,33 +197,11 @@ worker: optional: true # Ray writes logs to /tmp/ray/session_latests/logs volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - name: gcs-fuse-csi-ephemeral mountPath: /data - name: secret-volume mountPath: /etc/secret-volume readOnly: true - # sidecarContainers specifies additional containers to attach to the Ray pod. - # Follows standard K8s container spec. - sidecarContainers: - - name: fluentbit - image: fluent/fluent-bit:1.9.6 - # These resource requests for Fluent Bit should be sufficient in production. - resources: - requests: - cpu: 100m - memory: 128Mi - ephemeral-storage: 2Gi - limits: - cpu: 100m - memory: 128Mi - ephemeral-storage: 4Gi - volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - - mountPath: /fluent-bit/etc/ - name: fluentbit-config # The map's key is used as the groupName. # For example, key:small-group in the map below diff --git a/modules/kuberay-monitoring/main.tf b/modules/kuberay-monitoring/main.tf index 91d31f268..59a68e359 100644 --- a/modules/kuberay-monitoring/main.tf +++ b/modules/kuberay-monitoring/main.tf @@ -12,13 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Temporary workaround to ensure the GMP webhook is installed before applying PodMonitorings. -# After migrated to use ray add-on, this can be removed. -resource "time_sleep" "wait_for_gmp_operator" { - create_duration = "60s" -} - -# google managed prometheus engine +# create frontend service for google managed prometheus engine resource "helm_release" "gmp-ray-monitoring" { name = "gmp-ray-monitoring" chart = "${path.module}/../../charts/gmp-engine/" @@ -26,9 +20,6 @@ resource "helm_release" "gmp-ray-monitoring" { create_namespace = var.create_namespace # Timeout is increased to guarantee sufficient scale-up time for Autopilot nodes. timeout = 1200 - values = [ - "${file("${path.module}/gmpvalues.yaml")}" - ] set { name = "gmp-frontend.projectID" value = var.project_id @@ -37,7 +28,6 @@ resource "helm_release" "gmp-ray-monitoring" { name = "gmp-frontend.serviceAccount" value = var.k8s_service_account } - depends_on = [time_sleep.wait_for_gmp_operator] } # grafana