From 6240198b76ba7ced8f21b92bc84f4db6794ec606 Mon Sep 17 00:00:00 2001
From: Aaron Rueth
Date: Wed, 18 Sep 2024 18:24:46 +0000
Subject: [PATCH] Uncomment CI steps

---
 cloudbuild.yaml | 686 ++++++++++++++++++++++++------------------------
 1 file changed, 343 insertions(+), 343 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index d5ea41498..169ab4a41 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -45,343 +45,343 @@ steps:
     dir: "applications/rag/"
     waitFor: ["validate platform"]
 
-  # Create cluster to test ray, jupyterhub, rag
-  # - id: 'create gke cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   env:
-  #     - "KUBE_LOAD_CONFIG_FILE=false"
-  #   entrypoint: 'sh'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       terraform apply \
-  #         -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
-  #         -var=project_id=$PROJECT_ID \
-  #         -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-  #         -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-  #         -var=subnetwork_region=$_REGION \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-  #         -var=cluster_location=$_REGION \
-  #         -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \
-  #         -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \
-  #         -auto-approve -no-color
-  #       echo "pass" > /workspace/gke_cluster_result.txt
-  #   dir: 'infrastructure/'
-  #   allowFailure: true
-  #   waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag]
-
-  # - id: 'test ray cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'sh'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       # Get kube config
-  #       gcloud container clusters get-credentials \
-  #         ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         --location $_REGION \
-  #         --project $PROJECT_ID
-
-  #       cd /workspace/applications/ray/
-  #       terraform apply \
-  #         -var-file=workloads.tfvars \
-  #         -var=project_id=$PROJECT_ID \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=cluster_location=$_REGION \
-  #         -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \
-  #         -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-  #         -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-  #         -var=enable_gpu=true \
-  #         -auto-approve -no-color
-  #       echo "pass" > /workspace/user_result.txt
-
-  #       chmod +x /workspace/scripts/ci/wait_for_pods.sh
-  #       /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000
-
-  #       kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s
-  #       # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable.
-  #       sleep 60s
-  #       kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 10s
-
-  #       ray job submit \
-  #         --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
-  #       echo "pass" > /workspace/ray_result.txt
-  #   allowFailure: true
-  #   waitFor: ['create gke cluster']
-
-  # - id: 'cleanup ray cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/applications/ray/
-  #       terraform destroy \
-  #         -var-file=workloads.tfvars \
-  #         -var=project_id=$PROJECT_ID \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=cluster_location=$_REGION \
-  #         -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \
-  #         -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-  #         -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-  #         -var=enable_gpu=true \
-  #         -auto-approve -no-color
-  #   allowFailure: true
-  #   waitFor: ['test ray cluster']
-
-  # - id: 'test jupyterhub'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
-
-  #       cd /workspace/applications/jupyter
-  #       terraform apply \
-  #         -var-file=workloads-without-iap.example.tfvars \
-  #         -var=project_id=$PROJECT_ID \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=cluster_location=$_REGION \
-  #         -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
-  #         -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-  #         -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-  #         -auto-approve -no-color
-  #       echo "pass" > /workspace/jupyterhub_tf_result.txt
-
-  #       kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s
-  #       kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter
-  #       kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER
-  #       echo "pass" > /workspace/jupyterhub_test_result.txt
-  #   allowFailure: true
-  #   waitFor: ['create gke cluster']
-
-  # - id: 'cleanup jupyterhub'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/applications/jupyter/
-  #       terraform destroy \
-  #         -var-file=workloads-without-iap.example.tfvars \
-  #         -var=project_id=$PROJECT_ID \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=cluster_location=$_REGION \
-  #         -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
-  #         -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-  #         -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-  #         -auto-approve -no-color
-  #   allowFailure: true
-  #   waitFor: ['test jupyterhub']
-
-  # - id: 'test rag'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'sh'
-  #   secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY']
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       # Get kube config
-  #       gcloud container clusters get-credentials \
-  #         ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         --location $_REGION \
-  #         --project $PROJECT_ID
-
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
-
-  #       cd /workspace/applications/rag/
-  #       terraform apply \
-  #         -var-file=workloads.tfvars \
-  #         -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-  #         -var=create_cluster=false \
-  #         -var=jupyter_add_auth=false \
-  #         -var=frontend_add_auth=false \
-  #         -var=project_id=$PROJECT_ID \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=cluster_location=$_REGION \
-  #         -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-  #         -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-  #         -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-  #         -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-  #         -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-  #         -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-  #         -auto-approve -no-color
-  #       echo "pass" > /workspace/rag_tf_result.txt
-
-  #       # Validate Ray: Make sure pods are running
-  #       kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s
-  #       kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       # Validate Ray: Check dashboard
-  #       ray job submit --working-dir ./tests \
-  #         --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
-  #       echo "pass" > /workspace/rag_ray_dashboard_result.txt
-
-  #       # Validate JupyterHub: Get hub url
-  #       kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
-  #       kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       # Validate JupyterHub: Test Hub
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER
-  #       echo "pass" > /workspace/rag_jupyterhub_test_result.txt
-
-  #       # Validate RAG: Test rag frontend
-  #       kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       cd /workspace/applications/rag/tests
-  #       python3 test_frontend.py "127.0.0.1:8081"
-  #       echo "pass" > /workspace/rag_frontend_result.txt
-
-  #       cd /workspace/
-  #       sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
-  #       sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
-  #       gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/
-  #       kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py
-  #       # Wait for jupyterhub to trigger notebook pod startup
-  #       sleep 5s
-  #       kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s
-  #       kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb
-  #       kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py
-
-  #       python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt"
-  #       echo "pass" > /workspace/rag_prompt_result.txt
-
-  #   allowFailure: true
-  #   waitFor: ['create gke cluster']
-
-  # - id: 'cleanup rag'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/applications/rag/
-  #       terraform destroy \
-  #         -var-file=workloads.tfvars \
-  #         -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-  #         -var=create_cluster=false \
-  #         -var=jupyter_add_auth=false \
-  #         -var=frontend_add_auth=false \
-  #         -var=project_id=$PROJECT_ID \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=cluster_location=$_REGION \
-  #         -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-  #         -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-  #         -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-  #         -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
-  #         -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-  #         -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-  #         -auto-approve -no-color
-  #   allowFailure: true
-  #   waitFor: ['test rag']
-
-  # - id: 'cleanup gke cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/infrastructure
-  #       terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-  #         -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #         -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-  #         -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-  #         -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-  #         -var=cluster_location=$_REGION -auto-approve -no-color
-
-  #   allowFailure: true
-  #   waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster']
-
-  # - id: 'check result'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then
-  #         echo "gke cluster creation failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then
-  #         echo "ray API run failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then
-  #         echo "ray cluster failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then
-  #         echo "jupyterhub tf failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then
-  #         echo "jupyterhub test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then
-  #         echo "rag tf failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then
-  #         echo "rag ray dashboard test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then
-  #         echo "rag jupyterhub test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then
-  #         echo "rag frontend test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then
-  #         echo "rag prompt test failed"
-  #         exit 1
-  #       fi
-  #   waitFor: ['cleanup gke cluster']
+  # Create cluster to test ray, jupyterhub, rag
+  - id: 'create gke cluster'
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    env:
+      - "KUBE_LOAD_CONFIG_FILE=false"
+    entrypoint: 'sh'
+    args:
+      - '-c'
+      - |
+        set -e
+
+        terraform apply \
+          -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
+          -var=project_id=$PROJECT_ID \
+          -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
+          -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
+          -var=subnetwork_region=$_REGION \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
+          -var=cluster_location=$_REGION \
+          -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \
+          -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \
+          -auto-approve -no-color
+        echo "pass" > /workspace/gke_cluster_result.txt
+    dir: 'infrastructure/'
+    allowFailure: true  # outcome is verified later by 'check result' via gke_cluster_result.txt
+    waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', 'validate rag']
+
+  - id: 'test ray cluster'
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'sh'
+    args:
+      - '-c'
+      - |
+        set -e
+
+        # Get kube config
+        gcloud container clusters get-credentials \
+          ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          --location $_REGION \
+          --project $PROJECT_ID
+
+        cd /workspace/applications/ray/
+        terraform apply \
+          -var-file=workloads.tfvars \
+          -var=project_id=$PROJECT_ID \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=cluster_location=$_REGION \
+          -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \
+          -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+          -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
+          -var=enable_gpu=true \
+          -auto-approve -no-color
+        echo "pass" > /workspace/user_result.txt
+
+        chmod +x /workspace/scripts/ci/wait_for_pods.sh
+        /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000
+
+        kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s
+        # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable.
+        sleep 60s
+        kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 &
+        # Wait port-forwarding to take its place
+        sleep 10s
+
+        ray job submit \
+          --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
+        echo "pass" > /workspace/ray_result.txt
+    allowFailure: true  # outcome is verified later by 'check result' via user_result.txt / ray_result.txt
+    waitFor: ['create gke cluster']
+
+  - id: 'cleanup ray cluster'
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        set -e
+
+        cd /workspace/applications/ray/
+        terraform destroy \
+          -var-file=workloads.tfvars \
+          -var=project_id=$PROJECT_ID \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=cluster_location=$_REGION \
+          -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \
+          -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+          -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
+          -var=enable_gpu=true \
+          -auto-approve -no-color
+    allowFailure: true  # a failed destroy must not stop the remaining cleanup steps
+    waitFor: ['test ray cluster']
+
+  - id: 'test jupyterhub'
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        set -e
+
+        cd /workspace/modules/jupyter/tests
+        python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
+
+        cd /workspace/applications/jupyter
+        terraform apply \
+          -var-file=workloads-without-iap.example.tfvars \
+          -var=project_id=$PROJECT_ID \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=cluster_location=$_REGION \
+          -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
+          -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+          -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
+          -auto-approve -no-color
+        echo "pass" > /workspace/jupyterhub_tf_result.txt
+
+        kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s
+        kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter
+        kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 &
+        # Wait port-forwarding to take its place
+        sleep 5s
+
+        cd /workspace/modules/jupyter/tests
+        python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER
+        echo "pass" > /workspace/jupyterhub_test_result.txt
+    allowFailure: true  # outcome is verified later by 'check result' via jupyterhub_*_result.txt
+    waitFor: ['create gke cluster']
+
+  - id: 'cleanup jupyterhub'
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        set -e
+
+        cd /workspace/applications/jupyter/
+        terraform destroy \
+          -var-file=workloads-without-iap.example.tfvars \
+          -var=project_id=$PROJECT_ID \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=cluster_location=$_REGION \
+          -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
+          -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+          -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
+          -auto-approve -no-color
+    allowFailure: true  # a failed destroy must not stop the remaining cleanup steps
+    waitFor: ['test jupyterhub']
+
+  - id: 'test rag'
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'sh'
+    secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY']
+    args:
+      - '-c'
+      - |
+        set -e
+
+        # Get kube config
+        gcloud container clusters get-credentials \
+          ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          --location $_REGION \
+          --project $PROJECT_ID
+
+        cd /workspace/modules/jupyter/tests
+        python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
+
+        cd /workspace/applications/rag/
+        terraform apply \
+          -var-file=workloads.tfvars \
+          -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
+          -var=create_cluster=false \
+          -var=jupyter_add_auth=false \
+          -var=frontend_add_auth=false \
+          -var=project_id=$PROJECT_ID \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=cluster_location=$_REGION \
+          -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
+          -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
+          -auto-approve -no-color
+        echo "pass" > /workspace/rag_tf_result.txt
+
+        # Validate Ray: Make sure pods are running
+        kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s
+        kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 &
+        # Wait port-forwarding to take its place
+        sleep 5s
+
+        # Validate Ray: Check dashboard
+        ray job submit --working-dir ./tests \
+          --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
+        echo "pass" > /workspace/rag_ray_dashboard_result.txt
+
+        # Validate JupyterHub: Get hub url
+        kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
+        kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 &
+        # Wait port-forwarding to take its place
+        sleep 5s
+
+        # Validate JupyterHub: Test Hub
+        cd /workspace/modules/jupyter/tests
+        python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER
+        echo "pass" > /workspace/rag_jupyterhub_test_result.txt
+
+        # Validate RAG: Test rag frontend
+        kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &
+        # Wait port-forwarding to take its place
+        sleep 5s
+
+        cd /workspace/applications/rag/tests
+        python3 test_frontend.py "127.0.0.1:8081"
+        echo "pass" > /workspace/rag_frontend_result.txt
+
+        cd /workspace/
+        sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
+        sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
+        gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/
+        kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py
+        # Wait for jupyterhub to trigger notebook pod startup
+        sleep 5s
+        kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s
+        kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb
+        kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py
+
+        python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt"
+        echo "pass" > /workspace/rag_prompt_result.txt
+
+    allowFailure: true  # outcome is verified later by 'check result' via the rag_*_result.txt files
+    waitFor: ['create gke cluster']
+
+  - id: 'cleanup rag'  # -var values must mirror the 'test rag' apply, including the *-sa-4-rag-* account names
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        set -e
+
+        cd /workspace/applications/rag/
+        terraform destroy \
+          -var-file=workloads.tfvars \
+          -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
+          -var=create_cluster=false \
+          -var=jupyter_add_auth=false \
+          -var=frontend_add_auth=false \
+          -var=project_id=$PROJECT_ID \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=cluster_location=$_REGION \
+          -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
+          -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+          -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
+          -auto-approve -no-color
+    allowFailure: true  # a failed destroy must not stop the remaining cleanup steps
+    waitFor: ['test rag']
+
+  - id: 'cleanup gke cluster'
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        set -e
+
+        cd /workspace/infrastructure
+        terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
+          -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+          -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
+          -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
+          -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
+          -var=cluster_location=$_REGION -auto-approve -no-color
+
+    allowFailure: true  # 'check result' still has to run after a failed destroy
+    waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster']
+
+  - id: 'check result'  # fails the build if any earlier step did not write "pass" to its result file
+    name: 'gcr.io/$PROJECT_ID/terraform'
+    entrypoint: 'bash'
+    args:
+      - '-c'
+      - |
+        if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then
+          echo "gke cluster creation failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then
+          echo "ray API run failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then
+          echo "ray cluster failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then
+          echo "jupyterhub tf failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then
+          echo "jupyterhub test failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then
+          echo "rag tf failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then
+          echo "rag ray dashboard test failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then
+          echo "rag jupyterhub test failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then
+          echo "rag frontend test failed"
+          exit 1
+        fi
+
+        if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then
+          echo "rag prompt test failed"
+          exit 1
+        fi
+    waitFor: ['cleanup gke cluster']
 
 substitutions:
   _REGION: us-east4
@@ -393,9 +393,9 @@ options:
   substitutionOption: "ALLOW_LOOSE"
   machineType: "E2_HIGHCPU_8"
 timeout: 5400s
-# availableSecrets:
-#   secretManager:
-#   - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest
-#     env: "KAGGLE_USERNAME"
-#   - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest
-#     env: "KAGGLE_KEY"
+availableSecrets:
+  secretManager:
+  - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest
+    env: "KAGGLE_USERNAME"
+  - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest
+    env: "KAGGLE_KEY"