diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 629e607aa..90c3c44e5 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -64,25 +64,25 @@ module "infra" { source = "../../infrastructure" count = var.create_cluster ? 1 : 0 - project_id = var.project_id - cluster_name = local.cluster_name - cluster_location = var.cluster_location - region = local.cluster_location_region - autopilot_cluster = var.autopilot_cluster - private_cluster = var.private_cluster - create_network = var.create_network - network_name = local.network_name - subnetwork_name = local.network_name - subnetwork_cidr = var.subnetwork_cidr - subnetwork_region = local.cluster_location_region - cpu_pools = var.cpu_pools - enable_gpu = true - gpu_pools = var.gpu_pools - ray_addon_enabled = true + project_id = var.project_id + cluster_name = local.cluster_name + cluster_location = var.cluster_location + region = local.cluster_location_region + autopilot_cluster = var.autopilot_cluster + private_cluster = var.private_cluster + create_network = var.create_network + network_name = local.network_name + subnetwork_name = local.network_name + subnetwork_cidr = var.subnetwork_cidr + subnetwork_region = local.cluster_location_region + cpu_pools = var.cpu_pools + enable_gpu = true + gpu_pools = var.gpu_pools + ray_addon_enabled = true # TODO(genlu): remove channel and k8s_version after ray addon is in REGULAR channel - release_channel = "RAPID" + release_channel = "RAPID" kubernetes_version = "1.30.3-gke.1969000" - depends_on = [module.project-services] + depends_on = [module.project-services] } data "google_container_cluster" "default" { diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 47f4f4765..06e9dcd5b 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -100,10 +100,36 @@ steps: -auto-approve -no-color echo "pass" > /workspace/user_result.txt + wait_for_pod_with_timeout() { + local NAMESPACE=$1 + local TIMEOUT=$2 + local START_TIME=$(date +%s) + + while true; do + POD_COUNT=$(kubectl get pods -n "$NAMESPACE" --no-headers 2>/dev/null | wc -l) + + if [[ "$POD_COUNT" -gt 0 ]]; then + return 0 + fi + + CURRENT_TIME=$(date +%s) + ELAPSED_TIME=$((CURRENT_TIME - START_TIME)) + + if [[ "$ELAPSED_TIME" -ge "$TIMEOUT" ]]; then + echo "Timeout reached after ${TIMEOUT} seconds. No pods found in the namespace '$NAMESPACE'." + return 1 + fi + + echo "No pods found yet in the namespace '$NAMESPACE'. Checking again in 5 seconds..." + sleep 5 + done + } + # Make sure pods are running + wait_for_pod_with_timeout "ml-$SHORT_SHA-$_BUILD_ID-ray" "60" kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s - # Wait for pods to be stable - sleep 5s + # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable. + sleep 60s kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 & # Wait port-forwarding to take its place sleep 10s diff --git a/modules/kuberay-logging/config/fluent-bit.conf b/modules/kuberay-logging/config/fluent-bit.conf deleted file mode 100644 index 878ba1f1f..000000000 --- a/modules/kuberay-logging/config/fluent-bit.conf +++ /dev/null @@ -1,18 +0,0 @@ -[SERVICE] - Parsers_File parsers.conf -[INPUT] - Name tail - Path /tmp/ray/session_latest/logs/* - Tag ray - Path_Key filename - Refresh_Interval 5 -[FILTERS] - Name parser - Match ray - Key_Name filename - Parser rayjob - Reserve_Data On -[OUTPUT] - Name stdout - Format json_lines - Match * \ No newline at end of file diff --git a/modules/kuberay-logging/config/parsers.conf b/modules/kuberay-logging/config/parsers.conf deleted file mode 100644 index 3d0e01573..000000000 --- a/modules/kuberay-logging/config/parsers.conf +++ /dev/null @@ -1,4 +0,0 @@ -[PARSER] - Name rayjob - Format regex - Regex job-driver-(?[^.]*)\.log \ No newline at end of file diff --git a/modules/kuberay-logging/kubernetes.tf b/modules/kuberay-logging/kubernetes.tf deleted file mode 100644 index e69de29bb..000000000 diff --git a/modules/kuberay-logging/main.tf b/modules/kuberay-logging/main.tf deleted file mode 100644 index b0a9d2296..000000000 --- a/modules/kuberay-logging/main.tf +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -resource "kubernetes_config_map" "example" { - metadata { - name = "fluentbit-config" - namespace = var.namespace - } - - data = { - "fluent-bit.conf" = "${file("${path.module}/config/fluent-bit.conf")}" - "parsers.conf" = "${file("${path.module}/config/parsers.conf")}" - } - -} diff --git a/modules/kuberay-logging/variables.tf b/modules/kuberay-logging/variables.tf deleted file mode 100644 index 3f2342703..000000000 --- a/modules/kuberay-logging/variables.tf +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "namespace" { - type = string - description = "Kubernetes namespace where resources are deployed" - default = "ray" -} diff --git a/modules/kuberay-logging/versions.tf b/modules/kuberay-logging/versions.tf deleted file mode 100644 index f04476982..000000000 --- a/modules/kuberay-logging/versions.tf +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -terraform { - required_providers { - helm = { - source = "hashicorp/helm" - version = "~> 2.8.0" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = "2.18.1" - } - # kubectl = { - # source = "alekc/kubectl" - # version = "2.0.3" - # } - } - provider_meta "google" { - module_name = "blueprints/terraform/terraform-google-kubernetes-engine:kuberay/v0.1.0" - } -} diff --git a/modules/kuberay-operator/kuberay-operator-autopilot-values.yaml b/modules/kuberay-operator/kuberay-operator-autopilot-values.yaml deleted file mode 100644 index ed978c7dd..000000000 --- a/modules/kuberay-operator/kuberay-operator-autopilot-values.yaml +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Default values for kuberay-operator. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -image: - repository: kuberay/operator - tag: v1.0.0 - pullPolicy: IfNotPresent - -nameOverride: "kuberay-operator" -fullnameOverride: "kuberay-operator" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "kuberay-operator" - -service: - type: ClusterIP - port: 8080 - -resources: - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do whelm to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - limits: - cpu: 100m - # Anecdotally, managing 500 Ray pods requires roughly 500MB memory. - # Monitor memory usage and adjust as needed. - memory: 512Mi - -livenessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -readinessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -batchScheduler: - enabled: false - -# Set up `securityContext` to improve Pod security. -# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/pod-security.md for further guidance. -securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL - runAsNonRoot: true - - -# If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on. -rbacEnable: true - -# When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services) -# and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable -# is set to false, the Role and RoleBinding for leader election will still be created. -# -# Note: -# (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true. -# (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD. -crNamespacedRbacEnable: true - -# When singleNamespaceInstall is true: -# - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that -# the chart can be installed by users with permissions restricted to a single namespace. -# (Please note that this excludes the CRDs, which can only be installed at the cluster scope.) -# - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen -# to resource events within its own namespace. -singleNamespaceInstall: true - -# The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter. -# watchNamespace: -# - n1 -# - n2 - -# Environment variables -env: -# If not set or set to true, kuberay auto injects an init container waiting for ray GCS. -# If false, you will need to inject your own init container to ensure ray GCS is up before the ray workers start. -# Warning: we highly recommend setting to true and let kuberay handle for you. -# Note: This is disabled because gcs fuse csi based volumes cannot be mounted in init containers. -- name: ENABLE_INIT_CONTAINER_INJECTION - value: "false" -# If not set or set to "", kuberay will pick up the default k8s cluster domain `cluster.local` -# Otherwise, kuberay will use your custom domain -# - name: CLUSTER_DOMAIN -# value: "" -# If not set or set to false, when running on OpenShift with Ingress creation enabled, kuberay will create OpenShift route -# Otherwise, regardless of the type of cluster with Ingress creation enabled, kuberay will create Ingress -# - name: USE_INGRESS_ON_OPENSHIFT -# value: "true" -# Unconditionally requeue after the number of seconds specified in the -# environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the -# environment variable is not set, requeue after the default value (300). -# - name: RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV -# value: 300 -# If not set or set to "true", KubeRay will clean up the Redis storage namespace when a GCS FT-enabled RayCluster is deleted. -# - name: ENABLE_GCS_FT_REDIS_CLEANUP -# value: "true" diff --git a/modules/kuberay-operator/kuberay-operator-values.yaml b/modules/kuberay-operator/kuberay-operator-values.yaml deleted file mode 100644 index ed978c7dd..000000000 --- a/modules/kuberay-operator/kuberay-operator-values.yaml +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Default values for kuberay-operator. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -image: - repository: kuberay/operator - tag: v1.0.0 - pullPolicy: IfNotPresent - -nameOverride: "kuberay-operator" -fullnameOverride: "kuberay-operator" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "kuberay-operator" - -service: - type: ClusterIP - port: 8080 - -resources: - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do whelm to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - limits: - cpu: 100m - # Anecdotally, managing 500 Ray pods requires roughly 500MB memory. - # Monitor memory usage and adjust as needed. - memory: 512Mi - -livenessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -readinessProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 5 - -batchScheduler: - enabled: false - -# Set up `securityContext` to improve Pod security. -# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/pod-security.md for further guidance. -securityContext: - allowPrivilegeEscalation: false - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL - runAsNonRoot: true - - -# If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on. -rbacEnable: true - -# When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services) -# and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable -# is set to false, the Role and RoleBinding for leader election will still be created. -# -# Note: -# (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true. -# (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD. -crNamespacedRbacEnable: true - -# When singleNamespaceInstall is true: -# - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that -# the chart can be installed by users with permissions restricted to a single namespace. -# (Please note that this excludes the CRDs, which can only be installed at the cluster scope.) -# - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen -# to resource events within its own namespace. -singleNamespaceInstall: true - -# The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter. -# watchNamespace: -# - n1 -# - n2 - -# Environment variables -env: -# If not set or set to true, kuberay auto injects an init container waiting for ray GCS. -# If false, you will need to inject your own init container to ensure ray GCS is up before the ray workers start. -# Warning: we highly recommend setting to true and let kuberay handle for you. -# Note: This is disabled because gcs fuse csi based volumes cannot be mounted in init containers. -- name: ENABLE_INIT_CONTAINER_INJECTION - value: "false" -# If not set or set to "", kuberay will pick up the default k8s cluster domain `cluster.local` -# Otherwise, kuberay will use your custom domain -# - name: CLUSTER_DOMAIN -# value: "" -# If not set or set to false, when running on OpenShift with Ingress creation enabled, kuberay will create OpenShift route -# Otherwise, regardless of the type of cluster with Ingress creation enabled, kuberay will create Ingress -# - name: USE_INGRESS_ON_OPENSHIFT -# value: "true" -# Unconditionally requeue after the number of seconds specified in the -# environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the -# environment variable is not set, requeue after the default value (300). -# - name: RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV -# value: 300 -# If not set or set to "true", KubeRay will clean up the Redis storage namespace when a GCS FT-enabled RayCluster is deleted. -# - name: ENABLE_GCS_FT_REDIS_CLEANUP -# value: "true" diff --git a/modules/kuberay-operator/kuberay.tf b/modules/kuberay-operator/kuberay.tf deleted file mode 100644 index e754bdf89..000000000 --- a/modules/kuberay-operator/kuberay.tf +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -resource "helm_release" "kuberay-operator" { - name = var.name - repository = "https://ray-project.github.io/kuberay-helm/" - chart = "kuberay-operator" - values = var.autopilot_cluster ? [file("${path.module}/kuberay-operator-autopilot-values.yaml")] : [file("${path.module}/kuberay-operator-values.yaml")] - version = "1.0.0" - namespace = var.namespace - cleanup_on_fail = "true" - create_namespace = var.create_namespace -} - -# Grant access to batchv1/Jobs to kuberay-operator since the kuberay-operator role is missing some permissions. -# See https://github.com/ray-project/kuberay/issues/1706 for more details. -# TODO: remove this role binding once the kuberay-operator helm chart is upgraded to v1.1 -resource "kubernetes_role_binding_v1" "kuberay_batch_jobs" { - metadata { - name = "kuberay-operator-batch-jobs" - namespace = var.namespace - } - - subject { - kind = "ServiceAccount" - name = "kuberay-operator" - namespace = var.namespace - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "Role" - name = "kuberay-operator-batch-jobs" - } - - depends_on = [helm_release.kuberay-operator] -} - -resource "kubernetes_role_v1" "kuberay_batch_jobs" { - metadata { - name = "kuberay-operator-batch-jobs" - namespace = var.namespace - } - - rule { - api_groups = ["batch"] - resources = ["jobs"] - verbs = ["*"] - } - - depends_on = [helm_release.kuberay-operator] -} diff --git a/modules/kuberay-operator/variables.tf b/modules/kuberay-operator/variables.tf deleted file mode 100644 index a977c27de..000000000 --- a/modules/kuberay-operator/variables.tf +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -variable "project_id" { - type = string - description = "GCP project id" -} - -variable "name" { - type = string - default = "kuberay-operator" -} - -variable "namespace" { - type = string - description = "Kubernetes namespace where resources are deployed" -} - -variable "create_namespace" { - type = bool -} - -variable "autopilot_cluster" { - type = bool -} diff --git a/modules/kuberay-operator/versions.tf b/modules/kuberay-operator/versions.tf deleted file mode 100644 index 330033278..000000000 --- a/modules/kuberay-operator/versions.tf +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -terraform { - required_providers { - helm = { - source = "hashicorp/helm" - version = "~> 2.8.0" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = "2.18.1" - } - } - provider_meta "google" { - module_name = "blueprints/terraform/terraform-google-kubernetes-engine:kuberay/v0.1.0" - } -}