From 1d8ccc30c2fe511d863a56184d5e90d7bdef5f93 Mon Sep 17 00:00:00 2001 From: genlu2011 Date: Fri, 30 Aug 2024 21:35:01 -0700 Subject: [PATCH] fix: use ray addon when creating GKE clusters (#781) * fix: use ray addon when creating GKE clusters Change-Id: Iac49ae3e2d57701754a50cbfc10b9bba70829f41 * delete kuberay-logging and kuberay-operator module Change-Id: I2b8eda653bcf4b1d1cd402111e7bd79d18eaf1ac --------- Co-authored-by: Gen Lu --- applications/rag/main.tf | 56 +++----- applications/rag/variables.tf | 2 +- applications/ray/main.tf | 30 +---- cloudbuild.yaml | 8 +- infrastructure/main.tf | 8 +- .../standard-gke-public.platform.tfvars | 4 + infrastructure/variables.tf | 10 +- modules/gke-autopilot-private-cluster/main.tf | 8 +- .../variables.tf | 6 + modules/gke-autopilot-public-cluster/main.tf | 8 +- .../gke-autopilot-public-cluster/variables.tf | 6 + modules/gke-standard-private-cluster/main.tf | 8 +- .../gke-standard-private-cluster/variables.tf | 6 + modules/gke-standard-public-cluster/main.tf | 8 +- .../gke-standard-public-cluster/variables.tf | 6 + modules/kuberay-cluster/values.yaml | 54 -------- .../kuberay-logging/config/fluent-bit.conf | 18 --- modules/kuberay-logging/config/parsers.conf | 4 - modules/kuberay-logging/kubernetes.tf | 0 modules/kuberay-logging/main.tf | 26 ---- modules/kuberay-logging/variables.tf | 19 --- modules/kuberay-logging/versions.tf | 33 ----- modules/kuberay-monitoring/main.tf | 12 +- .../kuberay-operator-autopilot-values.yaml | 121 ------------------ .../kuberay-operator-values.yaml | 121 ------------------ modules/kuberay-operator/kuberay.tf | 63 --------- modules/kuberay-operator/variables.tf | 36 ------ modules/kuberay-operator/versions.tf | 29 ----- scripts/ci/wait_for_pods.sh | 35 +++++ 29 files changed, 139 insertions(+), 606 deletions(-) delete mode 100644 modules/kuberay-logging/config/fluent-bit.conf delete mode 100644 modules/kuberay-logging/config/parsers.conf delete mode 100644 modules/kuberay-logging/kubernetes.tf 
delete mode 100644 modules/kuberay-logging/main.tf delete mode 100644 modules/kuberay-logging/variables.tf delete mode 100644 modules/kuberay-logging/versions.tf delete mode 100644 modules/kuberay-operator/kuberay-operator-autopilot-values.yaml delete mode 100644 modules/kuberay-operator/kuberay-operator-values.yaml delete mode 100644 modules/kuberay-operator/kuberay.tf delete mode 100644 modules/kuberay-operator/variables.tf delete mode 100644 modules/kuberay-operator/versions.tf create mode 100755 scripts/ci/wait_for_pods.sh diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 230b6c3e2..90c3c44e5 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -64,21 +64,24 @@ module "infra" { source = "../../infrastructure" count = var.create_cluster ? 1 : 0 - project_id = var.project_id - cluster_name = local.cluster_name - cluster_location = var.cluster_location - region = local.cluster_location_region - autopilot_cluster = var.autopilot_cluster - private_cluster = var.private_cluster - create_network = var.create_network - network_name = local.network_name - subnetwork_name = local.network_name - subnetwork_cidr = var.subnetwork_cidr - subnetwork_region = local.cluster_location_region - cpu_pools = var.cpu_pools - enable_gpu = true - gpu_pools = var.gpu_pools - kubernetes_version = var.kubernetes_version + project_id = var.project_id + cluster_name = local.cluster_name + cluster_location = var.cluster_location + region = local.cluster_location_region + autopilot_cluster = var.autopilot_cluster + private_cluster = var.private_cluster + create_network = var.create_network + network_name = local.network_name + subnetwork_name = local.network_name + subnetwork_cidr = var.subnetwork_cidr + subnetwork_region = local.cluster_location_region + cpu_pools = var.cpu_pools + enable_gpu = true + gpu_pools = var.gpu_pools + ray_addon_enabled = true + # TODO(genlu): remove channel and k8s_version after ray addon is in REGULAR channel + 
release_channel = "RAPID" + kubernetes_version = "1.30.3-gke.1969000" depends_on = [module.project-services] } @@ -152,16 +155,6 @@ module "namespace" { namespace = local.kubernetes_namespace } -module "kuberay-operator" { - source = "../../modules/kuberay-operator" - providers = { helm = helm.rag, kubernetes = kubernetes.rag } - name = "kuberay-operator" - project_id = var.project_id - create_namespace = true - namespace = local.kubernetes_namespace - autopilot_cluster = local.enable_autopilot -} - module "gcs" { source = "../../modules/gcs" count = var.create_gcs_bucket ? 1 : 0 @@ -216,13 +209,6 @@ module "jupyterhub" { depends_on = [module.namespace, module.gcs] } -module "kuberay-logging" { - source = "../../modules/kuberay-logging" - providers = { kubernetes = kubernetes.rag } - namespace = local.kubernetes_namespace - depends_on = [module.namespace] -} - module "kuberay-workload-identity" { providers = { kubernetes = kubernetes.rag } source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" @@ -245,8 +231,7 @@ module "kuberay-monitoring" { create_namespace = true enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard k8s_service_account = local.ray_service_account - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. - depends_on = [module.namespace, module.kuberay-operator, module.kuberay-workload-identity] + depends_on = [module.namespace, module.kuberay-workload-identity] } module "kuberay-cluster" { @@ -281,8 +266,7 @@ module "kuberay-cluster" { k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port domain = var.ray_dashboard_domain members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : [] - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. 
- depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity] + depends_on = [module.gcs, module.kuberay-workload-identity] } module "inference-server" { diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf index c80f39791..b8cef7bd5 100644 --- a/applications/rag/variables.tf +++ b/applications/rag/variables.tf @@ -33,7 +33,7 @@ variable "cluster_location" { variable "kubernetes_version" { type = string - default = "1.28" + default = "1.30" } variable "kubernetes_namespace" { diff --git a/applications/ray/main.tf b/applications/ray/main.tf index 8f4a5ecef..585490caf 100644 --- a/applications/ray/main.tf +++ b/applications/ray/main.tf @@ -73,7 +73,11 @@ module "infra" { cpu_pools = var.cpu_pools enable_gpu = var.enable_gpu gpu_pools = var.gpu_pools - depends_on = [module.project-services] + ray_addon_enabled = true + # TODO(genlu): remove channel and k8s_version after ray addon is in REGULAR channel + release_channel = "RAPID" + kubernetes_version = "1.30.3-gke.1969000" + depends_on = [module.project-services] } data "google_container_cluster" "default" { @@ -147,24 +151,6 @@ module "kuberay-workload-identity" { depends_on = [module.namespace] } -module "kuberay-operator" { - source = "../../modules/kuberay-operator" - providers = { helm = helm.ray, kubernetes = kubernetes.ray } - name = "kuberay-operator" - create_namespace = true - namespace = local.kubernetes_namespace - project_id = var.project_id - autopilot_cluster = local.enable_autopilot -} - -module "kuberay-logging" { - source = "../../modules/kuberay-logging" - providers = { kubernetes = kubernetes.ray } - namespace = local.kubernetes_namespace - - depends_on = [module.namespace] -} - module "kuberay-monitoring" { count = var.create_ray_cluster ? 
1 : 0 source = "../../modules/kuberay-monitoring" @@ -175,8 +161,7 @@ module "kuberay-monitoring" { create_namespace = true enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard k8s_service_account = local.workload_identity_service_account - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. - depends_on = [module.kuberay-workload-identity, module.kuberay-operator] + depends_on = [module.kuberay-workload-identity] } module "gcs" { @@ -216,8 +201,7 @@ module "kuberay-cluster" { k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port domain = var.ray_dashboard_domain members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : [] - //TODO(genlu): remove the module.kuberay-operator after migrated using ray addon. - depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity] + depends_on = [module.gcs, module.kuberay-workload-identity] } diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 47f4f4765..9dab3c3fc 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -100,10 +100,12 @@ steps: -auto-approve -no-color echo "pass" > /workspace/user_result.txt - # Make sure pods are running + chmod +x /workspace/scripts/ci/wait_for_pods.sh + /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000 + kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s - # Wait for pods to be stable - sleep 5s + # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable. 
+ sleep 60s kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 & # Wait port-forwarding to take its place sleep 10s diff --git a/infrastructure/main.tf b/infrastructure/main.tf index 64000e87b..1b44cea8c 100644 --- a/infrastructure/main.tf +++ b/infrastructure/main.tf @@ -116,6 +116,7 @@ module "public-gke-standard-cluster" { all_node_pools_labels = var.all_node_pools_labels all_node_pools_metadata = var.all_node_pools_metadata all_node_pools_tags = var.all_node_pools_tags + ray_addon_enabled = var.ray_addon_enabled depends_on = [module.custom-network] } @@ -141,8 +142,8 @@ module "public-gke-autopilot-cluster" { ip_range_services = var.ip_range_services master_authorized_networks = var.master_authorized_networks deletion_protection = var.deletion_protection + ray_addon_enabled = var.ray_addon_enabled depends_on = [module.custom-network] - } ## create private GKE standard @@ -170,6 +171,7 @@ module "private-gke-standard-cluster" { deletion_protection = var.deletion_protection master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block + ray_addon_enabled = var.ray_addon_enabled ## pools config variables cpu_pools = var.cpu_pools @@ -207,7 +209,9 @@ module "private-gke-autopilot-cluster" { master_authorized_networks = length(var.master_authorized_networks) == 0 ? 
[{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block deletion_protection = var.deletion_protection - depends_on = [module.custom-network] + ray_addon_enabled = var.ray_addon_enabled + + depends_on = [module.custom-network] } diff --git a/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars b/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars index 3a8492e41..999e5b44c 100644 --- a/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars +++ b/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars @@ -36,6 +36,10 @@ autopilot_cluster = false # false = standard cluster, true = autopilot cluster cluster_name = "test-cluster" cluster_location = "us-east4" gcs_fuse_csi_driver = true +ray_addon_enabled = true +# TODO(genlu): remove release_channel and kubernetes_version after 1.30.3-gke.1969000 is in REGULAR channel +release_channel = "RAPID" +kubernetes_version = "1.30.3-gke.1969000" cpu_pools = [{ name = "cpu-pool" diff --git a/infrastructure/variables.tf b/infrastructure/variables.tf index 98593e609..dafc23fda 100644 --- a/infrastructure/variables.tf +++ b/infrastructure/variables.tf @@ -95,7 +95,7 @@ variable "cluster_labels" { variable "kubernetes_version" { type = string - default = "1.28" + default = "1.30" } variable "release_channel" { @@ -127,6 +127,13 @@ variable "deletion_protection" { type = bool default = false } + +variable "ray_addon_enabled" { + type = bool + description = "Set to true to enable ray addon" + default = true +} + variable "master_authorized_networks" { type = list(object({ cidr_block = string @@ -173,6 +180,7 @@ variable "enable_tpu" { description = "Set to true to create TPU node pool" default = false } + variable "enable_gpu" { type = bool description = "Set to true to create GPU node pool" diff --git a/modules/gke-autopilot-private-cluster/main.tf 
b/modules/gke-autopilot-private-cluster/main.tf index 8a0e9284b..b7ceb277a 100644 --- a/modules/gke-autopilot-private-cluster/main.tf +++ b/modules/gke-autopilot-private-cluster/main.tf @@ -14,7 +14,7 @@ module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-private-cluster" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -35,7 +35,11 @@ module "gke" { master_authorized_networks = var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block deletion_protection = var.deletion_protection - + ray_operator_config = { + enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } } # GKE cluster fleet registration diff --git a/modules/gke-autopilot-private-cluster/variables.tf b/modules/gke-autopilot-private-cluster/variables.tf index 4009d186c..19dcb9e9f 100644 --- a/modules/gke-autopilot-private-cluster/variables.tf +++ b/modules/gke-autopilot-private-cluster/variables.tf @@ -84,3 +84,9 @@ variable "master_ipv4_cidr_block" { type = string default = "" } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/gke-autopilot-public-cluster/main.tf b/modules/gke-autopilot-public-cluster/main.tf index d86092c09..6f52c3ff7 100644 --- a/modules/gke-autopilot-public-cluster/main.tf +++ b/modules/gke-autopilot-public-cluster/main.tf @@ -14,7 +14,7 @@ module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-public-cluster" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -29,4 +29,10 @@ module "gke" { ip_range_services = var.ip_range_services master_authorized_networks = var.master_authorized_networks deletion_protection = var.deletion_protection + + ray_operator_config = { + 
enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } } diff --git a/modules/gke-autopilot-public-cluster/variables.tf b/modules/gke-autopilot-public-cluster/variables.tf index 549086888..1e9df15c0 100644 --- a/modules/gke-autopilot-public-cluster/variables.tf +++ b/modules/gke-autopilot-public-cluster/variables.tf @@ -79,3 +79,9 @@ variable "deletion_protection" { type = bool default = false } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/gke-standard-private-cluster/main.tf b/modules/gke-standard-private-cluster/main.tf index e76a386cb..dbb58fe70 100644 --- a/modules/gke-standard-private-cluster/main.tf +++ b/modules/gke-standard-private-cluster/main.tf @@ -18,7 +18,7 @@ locals { module "gke" { source = "terraform-google-modules/kubernetes-engine/google//modules/private-cluster" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -45,6 +45,12 @@ module "gke" { master_authorized_networks = var.master_authorized_networks master_ipv4_cidr_block = var.master_ipv4_cidr_block + ray_operator_config = { + enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } + node_pools = local.node_pools node_pools_oauth_scopes = { diff --git a/modules/gke-standard-private-cluster/variables.tf b/modules/gke-standard-private-cluster/variables.tf index e4ad14916..bab97d5f9 100644 --- a/modules/gke-standard-private-cluster/variables.tf +++ b/modules/gke-standard-private-cluster/variables.tf @@ -133,3 +133,9 @@ variable "datapath_provider" { type = string default = "ADVANCED_DATAPATH" } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/gke-standard-public-cluster/main.tf 
b/modules/gke-standard-public-cluster/main.tf index fb5d49fb8..677d65f49 100644 --- a/modules/gke-standard-public-cluster/main.tf +++ b/modules/gke-standard-public-cluster/main.tf @@ -18,7 +18,7 @@ locals { module "gke" { source = "terraform-google-modules/kubernetes-engine/google" - version = "29.0.0" + version = "32.0.1" project_id = var.project_id regional = var.cluster_regional name = var.cluster_name @@ -40,6 +40,12 @@ module "gke" { monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus master_authorized_networks = var.master_authorized_networks + ray_operator_config = { + enabled = var.ray_addon_enabled + logging_enabled = var.ray_addon_enabled + monitoring_enabled = var.ray_addon_enabled + } + node_pools = local.node_pools node_pools_oauth_scopes = { diff --git a/modules/gke-standard-public-cluster/variables.tf b/modules/gke-standard-public-cluster/variables.tf index 1fd865a19..9e2f242b7 100644 --- a/modules/gke-standard-public-cluster/variables.tf +++ b/modules/gke-standard-public-cluster/variables.tf @@ -128,3 +128,9 @@ variable "datapath_provider" { type = string default = "ADVANCED_DATAPATH" } + +variable "ray_addon_enabled" { + description = "Enable ray addon by default" + type = bool + default = true +} diff --git a/modules/kuberay-cluster/values.yaml b/modules/kuberay-cluster/values.yaml index a1028fd0a..d6e5487c0 100644 --- a/modules/kuberay-cluster/values.yaml +++ b/modules/kuberay-cluster/values.yaml @@ -114,11 +114,6 @@ head: securityContext: ${indent(4, chomp(yamlencode(security_context)))} volumes: - - name: ray-logs - emptyDir: {} - - name: fluentbit-config - configMap: - name: fluentbit-config - name: gcs-fuse-csi-ephemeral csi: driver: gcsfuse.csi.storage.gke.io @@ -132,33 +127,11 @@ head: optional: true # Ray writes logs to /tmp/ray/session_latests/logs volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - name: gcs-fuse-csi-ephemeral mountPath: /data - name: secret-volume mountPath: /etc/secret-volume readOnly: 
true - # sidecarContainers specifies additional containers to attach to the Ray pod. - # Follows standard K8s container spec. - sidecarContainers: - - name: fluentbit - image: fluent/fluent-bit:1.9.6 - # These resource requests for Fluent Bit should be sufficient in production. - resources: - requests: - cpu: 100m - memory: 128Mi - ephemeral-storage: 2Gi - limits: - cpu: 100m - memory: 128Mi - ephemeral-storage: 2Gi - volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - - mountPath: /fluent-bit/etc/ - name: fluentbit-config worker: # If you want to disable the default workergroup @@ -211,11 +184,6 @@ worker: securityContext: ${indent(4, chomp(yamlencode(security_context)))} volumes: - - name: ray-logs - emptyDir: {} - - name: fluentbit-config - configMap: - name: fluentbit-config - name: gcs-fuse-csi-ephemeral csi: driver: gcsfuse.csi.storage.gke.io @@ -229,33 +197,11 @@ worker: optional: true # Ray writes logs to /tmp/ray/session_latests/logs volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - name: gcs-fuse-csi-ephemeral mountPath: /data - name: secret-volume mountPath: /etc/secret-volume readOnly: true - # sidecarContainers specifies additional containers to attach to the Ray pod. - # Follows standard K8s container spec. - sidecarContainers: - - name: fluentbit - image: fluent/fluent-bit:1.9.6 - # These resource requests for Fluent Bit should be sufficient in production. - resources: - requests: - cpu: 100m - memory: 128Mi - ephemeral-storage: 2Gi - limits: - cpu: 100m - memory: 128Mi - ephemeral-storage: 4Gi - volumeMounts: - - mountPath: /tmp/ray - name: ray-logs - - mountPath: /fluent-bit/etc/ - name: fluentbit-config # The map's key is used as the groupName. 
# For example, key:small-group in the map below diff --git a/modules/kuberay-logging/config/fluent-bit.conf b/modules/kuberay-logging/config/fluent-bit.conf deleted file mode 100644 index 878ba1f1f..000000000 --- a/modules/kuberay-logging/config/fluent-bit.conf +++ /dev/null @@ -1,18 +0,0 @@ -[SERVICE] - Parsers_File parsers.conf -[INPUT] - Name tail - Path /tmp/ray/session_latest/logs/* - Tag ray - Path_Key filename - Refresh_Interval 5 -[FILTERS] - Name parser - Match ray - Key_Name filename - Parser rayjob - Reserve_Data On -[OUTPUT] - Name stdout - Format json_lines - Match * \ No newline at end of file diff --git a/modules/kuberay-logging/config/parsers.conf b/modules/kuberay-logging/config/parsers.conf deleted file mode 100644 index 3d0e01573..000000000 --- a/modules/kuberay-logging/config/parsers.conf +++ /dev/null @@ -1,4 +0,0 @@ -[PARSER] - Name rayjob - Format regex - Regex job-driver-(?[^.]*)\.log \ No newline at end of file diff --git a/modules/kuberay-logging/kubernetes.tf b/modules/kuberay-logging/kubernetes.tf deleted file mode 100644 index e69de29bb..000000000 diff --git a/modules/kuberay-logging/main.tf b/modules/kuberay-logging/main.tf deleted file mode 100644 index b0a9d2296..000000000 --- a/modules/kuberay-logging/main.tf +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -resource "kubernetes_config_map" "example" { - metadata { - name = "fluentbit-config" - namespace = var.namespace - } - - data = { - "fluent-bit.conf" = "${file("${path.module}/config/fluent-bit.conf")}" - "parsers.conf" = "${file("${path.module}/config/parsers.conf")}" - } - -} diff --git a/modules/kuberay-logging/variables.tf b/modules/kuberay-logging/variables.tf deleted file mode 100644 index 3f2342703..000000000 --- a/modules/kuberay-logging/variables.tf +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "namespace" { - type = string - description = "Kubernetes namespace where resources are deployed" - default = "ray" -} diff --git a/modules/kuberay-logging/versions.tf b/modules/kuberay-logging/versions.tf deleted file mode 100644 index f04476982..000000000 --- a/modules/kuberay-logging/versions.tf +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -terraform { - required_providers { - helm = { - source = "hashicorp/helm" - version = "~> 2.8.0" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = "2.18.1" - } - # kubectl = { - # source = "alekc/kubectl" - # version = "2.0.3" - # } - } - provider_meta "google" { - module_name = "blueprints/terraform/terraform-google-kubernetes-engine:kuberay/v0.1.0" - } -} diff --git a/modules/kuberay-monitoring/main.tf b/modules/kuberay-monitoring/main.tf index 91d31f268..59a68e359 100644 --- a/modules/kuberay-monitoring/main.tf +++ b/modules/kuberay-monitoring/main.tf @@ -12,13 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Temporary workaround to ensure the GMP webhook is installed before applying PodMonitorings. -# After migrated to use ray add-on, this can be removed. -resource "time_sleep" "wait_for_gmp_operator" { - create_duration = "60s" -} - -# google managed prometheus engine +# create frontend service for google managed prometheus engine resource "helm_release" "gmp-ray-monitoring" { name = "gmp-ray-monitoring" chart = "${path.module}/../../charts/gmp-engine/" @@ -26,9 +20,6 @@ resource "helm_release" "gmp-ray-monitoring" { create_namespace = var.create_namespace # Timeout is increased to guarantee sufficient scale-up time for Autopilot nodes. 
timeout = 1200 - values = [ - "${file("${path.module}/gmpvalues.yaml")}" - ] set { name = "gmp-frontend.projectID" value = var.project_id @@ -37,7 +28,6 @@ resource "helm_release" "gmp-ray-monitoring" { name = "gmp-frontend.serviceAccount" value = var.k8s_service_account } - depends_on = [time_sleep.wait_for_gmp_operator] } # grafana diff --git a/modules/kuberay-operator/kuberay-operator-autopilot-values.yaml b/modules/kuberay-operator/kuberay-operator-autopilot-values.yaml deleted file mode 100644 index ed978c7dd..000000000 --- a/modules/kuberay-operator/kuberay-operator-autopilot-values.yaml +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Default values for kuberay-operator. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -image: - repository: kuberay/operator - tag: v1.0.0 - pullPolicy: IfNotPresent - -nameOverride: "kuberay-operator" -fullnameOverride: "kuberay-operator" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "kuberay-operator" - -service: - type: ClusterIP - port: 8080 - -resources: - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. 
This also increases chances charts run on environments with little - # resources, such as Minikube. If you do whelm to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - limits: - cpu: 100m - # Anecdotally, managing 500 Ray pods requires roughly 500MB memory. - # Monitor memory usage and adjust as needed. - memory: 512Mi - -livenessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -readinessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -batchScheduler: - enabled: false - -# Set up `securityContext` to improve Pod security. -# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/pod-security.md for further guidance. -securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL - runAsNonRoot: true - - -# If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on. -rbacEnable: true - -# When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services) -# and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable -# is set to false, the Role and RoleBinding for leader election will still be created. -# -# Note: -# (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true. -# (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD. 
-crNamespacedRbacEnable: true - -# When singleNamespaceInstall is true: -# - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that -# the chart can be installed by users with permissions restricted to a single namespace. -# (Please note that this excludes the CRDs, which can only be installed at the cluster scope.) -# - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen -# to resource events within its own namespace. -singleNamespaceInstall: true - -# The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter. -# watchNamespace: -# - n1 -# - n2 - -# Environment variables -env: -# If not set or set to true, kuberay auto injects an init container waiting for ray GCS. -# If false, you will need to inject your own init container to ensure ray GCS is up before the ray workers start. -# Warning: we highly recommend setting to true and let kuberay handle for you. -# Note: This is disabled because gcs fuse csi based volumes cannot be mounted in init containers. -- name: ENABLE_INIT_CONTAINER_INJECTION - value: "false" -# If not set or set to "", kuberay will pick up the default k8s cluster domain `cluster.local` -# Otherwise, kuberay will use your custom domain -# - name: CLUSTER_DOMAIN -# value: "" -# If not set or set to false, when running on OpenShift with Ingress creation enabled, kuberay will create OpenShift route -# Otherwise, regardless of the type of cluster with Ingress creation enabled, kuberay will create Ingress -# - name: USE_INGRESS_ON_OPENSHIFT -# value: "true" -# Unconditionally requeue after the number of seconds specified in the -# environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the -# environment variable is not set, requeue after the default value (300). 
-# - name: RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV -# value: 300 -# If not set or set to "true", KubeRay will clean up the Redis storage namespace when a GCS FT-enabled RayCluster is deleted. -# - name: ENABLE_GCS_FT_REDIS_CLEANUP -# value: "true" diff --git a/modules/kuberay-operator/kuberay-operator-values.yaml b/modules/kuberay-operator/kuberay-operator-values.yaml deleted file mode 100644 index ed978c7dd..000000000 --- a/modules/kuberay-operator/kuberay-operator-values.yaml +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Default values for kuberay-operator. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -image: - repository: kuberay/operator - tag: v1.0.0 - pullPolicy: IfNotPresent - -nameOverride: "kuberay-operator" -fullnameOverride: "kuberay-operator" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "kuberay-operator" - -service: - type: ClusterIP - port: 8080 - -resources: - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. 
If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - limits: - cpu: 100m - # Anecdotally, managing 500 Ray pods requires roughly 500MB memory. - # Monitor memory usage and adjust as needed. - memory: 512Mi - -livenessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -readinessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -batchScheduler: - enabled: false - -# Set up `securityContext` to improve Pod security. -# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/pod-security.md for further guidance. -securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL - runAsNonRoot: true - - -# If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on. -rbacEnable: true - -# When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services) -# and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable -# is set to false, the Role and RoleBinding for leader election will still be created. -# -# Note: -# (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true. -# (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD. -crNamespacedRbacEnable: true - -# When singleNamespaceInstall is true: -# - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that -# the chart can be installed by users with permissions restricted to a single namespace.
-# (Please note that this excludes the CRDs, which can only be installed at the cluster scope.) -# - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen -# to resource events within its own namespace. -singleNamespaceInstall: true - -# The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter. -# watchNamespace: -# - n1 -# - n2 - -# Environment variables -env: -# If not set or set to true, kuberay auto injects an init container waiting for ray GCS. -# If false, you will need to inject your own init container to ensure ray GCS is up before the ray workers start. -# Warning: we highly recommend setting to true and let kuberay handle for you. -# Note: This is disabled because gcs fuse csi based volumes cannot be mounted in init containers. -- name: ENABLE_INIT_CONTAINER_INJECTION - value: "false" -# If not set or set to "", kuberay will pick up the default k8s cluster domain `cluster.local` -# Otherwise, kuberay will use your custom domain -# - name: CLUSTER_DOMAIN -# value: "" -# If not set or set to false, when running on OpenShift with Ingress creation enabled, kuberay will create OpenShift route -# Otherwise, regardless of the type of cluster with Ingress creation enabled, kuberay will create Ingress -# - name: USE_INGRESS_ON_OPENSHIFT -# value: "true" -# Unconditionally requeue after the number of seconds specified in the -# environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the -# environment variable is not set, requeue after the default value (300). -# - name: RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV -# value: 300 -# If not set or set to "true", KubeRay will clean up the Redis storage namespace when a GCS FT-enabled RayCluster is deleted. 
-# - name: ENABLE_GCS_FT_REDIS_CLEANUP -# value: "true" diff --git a/modules/kuberay-operator/kuberay.tf b/modules/kuberay-operator/kuberay.tf deleted file mode 100644 index e754bdf89..000000000 --- a/modules/kuberay-operator/kuberay.tf +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -resource "helm_release" "kuberay-operator" { - name = var.name - repository = "https://ray-project.github.io/kuberay-helm/" - chart = "kuberay-operator" - values = var.autopilot_cluster ? [file("${path.module}/kuberay-operator-autopilot-values.yaml")] : [file("${path.module}/kuberay-operator-values.yaml")] - version = "1.0.0" - namespace = var.namespace - cleanup_on_fail = "true" - create_namespace = var.create_namespace -} - -# Grant access to batchv1/Jobs to kuberay-operator since the kuberay-operator role is missing some permissions. -# See https://github.com/ray-project/kuberay/issues/1706 for more details. 
-# TODO: remove this role binding once the kuberay-operator helm chart is upgraded to v1.1 -resource "kubernetes_role_binding_v1" "kuberay_batch_jobs" { - metadata { - name = "kuberay-operator-batch-jobs" - namespace = var.namespace - } - - subject { - kind = "ServiceAccount" - name = "kuberay-operator" - namespace = var.namespace - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "Role" - name = "kuberay-operator-batch-jobs" - } - - depends_on = [helm_release.kuberay-operator] -} - -resource "kubernetes_role_v1" "kuberay_batch_jobs" { - metadata { - name = "kuberay-operator-batch-jobs" - namespace = var.namespace - } - - rule { - api_groups = ["batch"] - resources = ["jobs"] - verbs = ["*"] - } - - depends_on = [helm_release.kuberay-operator] -} diff --git a/modules/kuberay-operator/variables.tf b/modules/kuberay-operator/variables.tf deleted file mode 100644 index a977c27de..000000000 --- a/modules/kuberay-operator/variables.tf +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-variable "project_id" {
-  type        = string
-  description = "GCP project id"
-}
-
-variable "name" {
-  type    = string
-  default = "kuberay-operator"
-}
-
-variable "namespace" {
-  type        = string
-  description = "Kubernetes namespace where resources are deployed"
-}
-
-variable "create_namespace" {
-  type = bool
-}
-
-variable "autopilot_cluster" {
-  type = bool
-}
diff --git a/modules/kuberay-operator/versions.tf b/modules/kuberay-operator/versions.tf
deleted file mode 100644
index 330033278..000000000
--- a/modules/kuberay-operator/versions.tf
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2023 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-terraform {
-  required_providers {
-    helm = {
-      source  = "hashicorp/helm"
-      version = "~> 2.8.0"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = "2.18.1"
-    }
-  }
-  provider_meta "google" {
-    module_name = "blueprints/terraform/terraform-google-kubernetes-engine:kuberay/v0.1.0"
-  }
-}
diff --git a/scripts/ci/wait_for_pods.sh b/scripts/ci/wait_for_pods.sh
new file mode 100755
index 000000000..228b37d79
--- /dev/null
+++ b/scripts/ci/wait_for_pods.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Wait until at least one pod exists in a namespace, or fail on timeout.
+#
+# Usage: wait_for_pods.sh <namespace> <timeout_seconds>
+#   namespace        Namespace polled with `kubectl get pods`.
+#   timeout_seconds  Non-negative integer number of seconds to keep polling.
+#
+# Exits 0 as soon as a pod is listed; exits 1 on bad arguments or timeout.
+
+NAMESPACE=$1
+TIMEOUT=$2
+START_TIME=$(date +%s)
+
+# Both arguments are required. An empty timeout is evaluated as 0 by the
+# arithmetic comparison in the loop, which would end the wait after the first
+# poll, so anything that is not a plain decimal integer is rejected here.
+if [[ -z "$NAMESPACE" || ! "$TIMEOUT" =~ ^[0-9]+$ ]]; then
+    echo "Usage: $0 <namespace> <timeout_seconds>" >&2
+    exit 1
+fi
+
+# Force base 10 so a leading zero (e.g. 060) is not read as octal.
+TIMEOUT=$((10#$TIMEOUT))
+
+echo "Waiting for any pod to exist in the namespace '$NAMESPACE' (timeout: ${TIMEOUT}s)..."
+
+# Loop until a pod exists in the namespace or timeout occurs
+while true; do
+    # kubectl errors are discarded, so a failed call counts as "no pods yet".
+    POD_COUNT=$(kubectl get pods -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l)
+
+    if [[ "$POD_COUNT" -gt 0 ]]; then
+        echo "Pod(s) found in the namespace '$NAMESPACE'."
+        break
+    fi
+
+    CURRENT_TIME=$(date +%s)
+    ELAPSED_TIME=$((CURRENT_TIME - START_TIME))
+
+    if [[ "$ELAPSED_TIME" -ge "$TIMEOUT" ]]; then
+        echo "Timeout reached after ${TIMEOUT} seconds. No pods found in the namespace '$NAMESPACE'."
+        exit 1
+    fi
+
+    echo "No pods found yet in the namespace '$NAMESPACE'. Checking again in 30 seconds..."
+    sleep 30
+done