From 4e640a9086fdf556d7664be0cf8ccda46198eff6 Mon Sep 17 00:00:00 2001 From: artemvmin Date: Tue, 9 Apr 2024 16:52:10 -0700 Subject: [PATCH] Remove timeout from RayCluster CR apply; bump CB timeout to mitigate stockouts (#576) Remove timeout from RayCluster CR apply RayCluster apply takes O(seconds). The actual ray worker deployment is done asynchronously by the ray operator. --- cloudbuild.yaml | 6 +++--- modules/kuberay-cluster/main.tf | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 0f0ef4c97..2c720b2d1 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -99,7 +99,7 @@ steps: echo "pass" > /workspace/user_result.txt # Make sure pods are running - kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s + kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=1200s kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8265:8265 & # Wait port-forwarding to take its place sleep 5s @@ -156,7 +156,7 @@ steps: -auto-approve -no-color -lock=false echo "pass" > /workspace/jupyterhub_tf_result.txt - kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s + kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=1200s kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 & # Wait port-forwarding to take its place @@ -227,7 +227,7 @@ steps: echo "pass" > /workspace/rag_tf_result.txt # Validate Ray: Make sure pods are running - kubectl wait --all pods -n rag-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s + kubectl wait --all pods -n rag-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=1200s kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8265:8265 & # Wait port-forwarding to take its place sleep 5s diff --git a/modules/kuberay-cluster/main.tf b/modules/kuberay-cluster/main.tf index 67559f8a0..36aaa49bf 100644 --- a/modules/kuberay-cluster/main.tf +++ b/modules/kuberay-cluster/main.tf @@ -34,9 +34,6 @@ resource "helm_release" "ray-cluster" { namespace = var.namespace create_namespace = true version = "1.0.0" - # Timeout is increased to guarantee sufficient scale-up time for Autopilot nodes. - timeout = 1200 - wait = true values = [ templatefile("${path.module}/values.yaml", {