diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 5d1c45bdc..bcace1643 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -13,376 +13,376 @@ # limitations under the License. steps: - - id: 'validate platform' - name: 'gcr.io/$PROJECT_ID/terraform' + - id: "validate platform" + name: "gcr.io/$PROJECT_ID/terraform" script: | terraform init -no-color terraform validate -no-color - dir: 'infrastructure/' - waitFor: ['-'] - - - id: 'validate ray' - name: 'gcr.io/$PROJECT_ID/terraform' + dir: "infrastructure/" + waitFor: ["-"] + + - id: "validate ray" + name: "gcr.io/$PROJECT_ID/terraform" script: | terraform init -no-color terraform validate -no-color - dir: 'applications/ray/' - waitFor: ['validate platform'] - - - id: 'validate jupyterhub' - name: 'gcr.io/$PROJECT_ID/terraform' + dir: "applications/ray/" + waitFor: ["validate platform"] + + - id: "validate jupyterhub" + name: "gcr.io/$PROJECT_ID/terraform" script: | terraform init -no-color terraform validate -no-color - dir: 'applications/jupyter/' - waitFor: ['validate platform'] - - - id: 'validate rag' - name: 'gcr.io/$PROJECT_ID/terraform' + dir: "applications/jupyter/" + waitFor: ["validate platform"] + + - id: "validate rag" + name: "gcr.io/$PROJECT_ID/terraform" script: | terraform init -no-color terraform validate -no-color - dir: 'applications/rag/' - waitFor: ['validate platform'] + dir: "applications/rag/" + waitFor: ["validate platform"] # Create cluster to test ray, jupyterhub, rag - - id: 'create gke cluster' - name: 'gcr.io/$PROJECT_ID/terraform' - env: - - "KUBE_LOAD_CONFIG_FILE=false" - entrypoint: 'sh' - args: - - '-c' - - | - set -e - - terraform apply \ - -var-file=tfvars_tests/standard-gke-public.platform.tfvars \ - -var=project_id=$PROJECT_ID \ - -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - -var=subnetwork_region=$_REGION \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ - -var=cluster_location=$_REGION \ - -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \ - -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \ - -auto-approve -no-color - echo "pass" > /workspace/gke_cluster_result.txt - dir: 'infrastructure/' - allowFailure: true - waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag] - - - id: 'test ray cluster' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'sh' - args: - - '-c' - - | - set -e - - # Get kube config - gcloud container clusters get-credentials \ - ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - --location $_REGION \ - --project $PROJECT_ID - - cd /workspace/applications/ray/ - terraform apply \ - -var-file=workloads.tfvars \ - -var=project_id=$PROJECT_ID \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=cluster_location=$_REGION \ - -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ - -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ - -var=enable_gpu=true \ - -auto-approve -no-color - echo "pass" > /workspace/user_result.txt - - chmod +x /workspace/scripts/ci/wait_for_pods.sh - /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000 - - kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s - # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable. - sleep 60s - kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 & - # Wait port-forwarding to take its place - sleep 10s - - ray job submit \ - --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" - echo "pass" > /workspace/ray_result.txt - allowFailure: true - waitFor: ['create gke cluster'] - - - id: 'cleanup ray cluster' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'bash' - args: - - '-c' - - | - set -e - - cd /workspace/applications/ray/ - terraform destroy \ - -var-file=workloads.tfvars \ - -var=project_id=$PROJECT_ID \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=cluster_location=$_REGION \ - -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ - -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ - -var=enable_gpu=true \ - -auto-approve -no-color - allowFailure: true - waitFor: ['test ray cluster'] - - - id: 'test jupyterhub' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'bash' - args: - - '-c' - - | - set -e - - cd /workspace/modules/jupyter/tests - python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER - - cd /workspace/applications/jupyter - terraform apply \ - -var-file=workloads-without-iap.example.tfvars \ - -var=project_id=$PROJECT_ID \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=cluster_location=$_REGION \ - -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ - -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ - -auto-approve -no-color - echo "pass" > /workspace/jupyterhub_tf_result.txt - - kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s - kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter - kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 & - # Wait port-forwarding to take its place - sleep 5s - - cd /workspace/modules/jupyter/tests - python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER - echo "pass" > /workspace/jupyterhub_test_result.txt - allowFailure: true - waitFor: ['create gke cluster'] - - - id: 'cleanup jupyterhub' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'bash' - args: - - '-c' - - | - set -e - - cd /workspace/applications/jupyter/ - terraform destroy \ - -var-file=workloads-without-iap.example.tfvars \ - -var=project_id=$PROJECT_ID \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=cluster_location=$_REGION \ - -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ - -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ - -auto-approve -no-color - allowFailure: true - waitFor: ['test jupyterhub'] - - - id: 'test rag' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'sh' - secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY'] - args: - - '-c' - - | - set -e - - # Get kube config - gcloud container clusters get-credentials \ - ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - --location $_REGION \ - --project $PROJECT_ID - - cd /workspace/modules/jupyter/tests - python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER - - cd /workspace/applications/rag/ - terraform apply \ - -var-file=workloads.tfvars \ - -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - -var=create_cluster=false \ - -var=jupyter_add_auth=false \ - -var=frontend_add_auth=false \ - -var=project_id=$PROJECT_ID \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=cluster_location=$_REGION \ - -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ - -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ - -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ - -auto-approve -no-color - echo "pass" > /workspace/rag_tf_result.txt - - # Validate Ray: Make sure pods are running - kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s - kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 & - # Wait port-forwarding to take its place - sleep 5s - - # Validate Ray: Check dashboard - ray job submit --working-dir ./tests \ - --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" - echo "pass" > /workspace/rag_ray_dashboard_result.txt - - # Validate JupyterHub: Get hub url - kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID - kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 & - # Wait port-forwarding to take its place - sleep 5s - - # Validate JupyterHub: Test Hub - cd /workspace/modules/jupyter/tests - python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER - echo "pass" > /workspace/rag_jupyterhub_test_result.txt - - # Validate RAG: Test rag frontend - kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 & - # Wait port-forwarding to take its place - sleep 5s - - cd /workspace/applications/rag/tests - python3 test_frontend.py "127.0.0.1:8081" - echo "pass" > /workspace/rag_frontend_result.txt - - cd /workspace/ - sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb - sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb - gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/ - kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py - # Wait for jupyterhub to trigger notebook pod startup - sleep 5s - kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s - kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb - kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py - - python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt" - echo "pass" > /workspace/rag_prompt_result.txt - - allowFailure: true - waitFor: ['create gke cluster'] - - - id: 'cleanup rag' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'bash' - args: - - '-c' - - | - set -e - - cd /workspace/applications/rag/ - terraform destroy \ - -var-file=workloads.tfvars \ - -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - -var=create_cluster=false \ - -var=jupyter_add_auth=false \ - -var=frontend_add_auth=false \ - -var=project_id=$PROJECT_ID \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=cluster_location=$_REGION \ - -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ - -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ - -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \ - -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ - -auto-approve -no-color - allowFailure: true - waitFor: ['test rag'] - - - id: 'cleanup gke cluster' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'bash' - args: - - '-c' - - | - set -e - - cd /workspace/infrastructure - terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \ - -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ - -var=cluster_location=$_REGION -auto-approve -no-color - - allowFailure: true - waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster'] - - - id: 'check result' - name: 'gcr.io/$PROJECT_ID/terraform' - entrypoint: 'bash' - args: - - '-c' - - | - if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then - echo "gke cluster creation failed" - exit 1 - fi - - if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then - echo "ray API run failed" - exit 1 - fi - - if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then - echo "ray cluster failed" - exit 1 - fi - - if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then - echo "jupyterhub tf failed" - exit 1 - fi - - if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then - echo "jupyterhub test failed" - exit 1 - fi - - if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then - echo "rag tf failed" - exit 1 - fi - - if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then - echo "rag ray dashboard test failed" - exit 1 - fi - - if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then - echo "rag jupyterhub test failed" - exit 1 - fi - - if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then - echo "rag frontend test failed" - exit 1 - fi - - if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then - echo "rag prompt test failed" - exit 1 - fi - waitFor: ['cleanup gke cluster'] - + # - id: 'create gke cluster' + # name: 'gcr.io/$PROJECT_ID/terraform' + # env: + # - "KUBE_LOAD_CONFIG_FILE=false" + # entrypoint: 'sh' + # args: + # - '-c' + # - | + # set -e + + # terraform apply \ + # -var-file=tfvars_tests/standard-gke-public.platform.tfvars \ + # -var=project_id=$PROJECT_ID \ + # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + # -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + # -var=subnetwork_region=$_REGION \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ + # -var=cluster_location=$_REGION \ + # -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \ + # -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \ + # -auto-approve -no-color + # echo "pass" > /workspace/gke_cluster_result.txt + # dir: 'infrastructure/' + # allowFailure: true + # waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag] + + # - id: 'test ray cluster' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'sh' + # args: + # - '-c' + # - | + # set -e + + # # Get kube config + # gcloud container clusters get-credentials \ + # ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # --location $_REGION \ + # --project $PROJECT_ID + + # cd /workspace/applications/ray/ + # terraform apply \ + # -var-file=workloads.tfvars \ + # -var=project_id=$PROJECT_ID \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=cluster_location=$_REGION \ + # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ + # -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ + # -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ + # -var=enable_gpu=true \ + # -auto-approve -no-color + # echo "pass" > /workspace/user_result.txt + + # chmod +x /workspace/scripts/ci/wait_for_pods.sh + # /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000 + + # kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s + # # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable. + # sleep 60s + # kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 & + # # Wait port-forwarding to take its place + # sleep 10s + + # ray job submit \ + # --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" + # echo "pass" > /workspace/ray_result.txt + # allowFailure: true + # waitFor: ['create gke cluster'] + + # - id: 'cleanup ray cluster' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'bash' + # args: + # - '-c' + # - | + # set -e + + # cd /workspace/applications/ray/ + # terraform destroy \ + # -var-file=workloads.tfvars \ + # -var=project_id=$PROJECT_ID \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=cluster_location=$_REGION \ + # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ + # -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ + # -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ + # -var=enable_gpu=true \ + # -auto-approve -no-color + # allowFailure: true + # waitFor: ['test ray cluster'] + + # - id: 'test jupyterhub' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'bash' + # args: + # - '-c' + # - | + # set -e + + # cd /workspace/modules/jupyter/tests + # python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER + + # cd /workspace/applications/jupyter + # terraform apply \ + # -var-file=workloads-without-iap.example.tfvars \ + # -var=project_id=$PROJECT_ID \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=cluster_location=$_REGION \ + # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ + # -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ + # -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ + # -auto-approve -no-color + # echo "pass" > /workspace/jupyterhub_tf_result.txt + + # kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s + # kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter + # kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 & + # # Wait port-forwarding to take its place + # sleep 5s + + # cd /workspace/modules/jupyter/tests + # python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER + # echo "pass" > /workspace/jupyterhub_test_result.txt + # allowFailure: true + # waitFor: ['create gke cluster'] + + # - id: 'cleanup jupyterhub' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'bash' + # args: + # - '-c' + # - | + # set -e + + # cd /workspace/applications/jupyter/ + # terraform destroy \ + # -var-file=workloads-without-iap.example.tfvars \ + # -var=project_id=$PROJECT_ID \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=cluster_location=$_REGION \ + # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ + # -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ + # -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ + # -auto-approve -no-color + # allowFailure: true + # waitFor: ['test jupyterhub'] + + # - id: 'test rag' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'sh' + # secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY'] + # args: + # - '-c' + # - | + # set -e + + # # Get kube config + # gcloud container clusters get-credentials \ + # ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # --location $_REGION \ + # --project $PROJECT_ID + + # cd /workspace/modules/jupyter/tests + # python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER + + # cd /workspace/applications/rag/ + # terraform apply \ + # -var-file=workloads.tfvars \ + # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + # -var=create_cluster=false \ + # -var=jupyter_add_auth=false \ + # -var=frontend_add_auth=false \ + # -var=project_id=$PROJECT_ID \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=cluster_location=$_REGION \ + # -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ + # -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ + # -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + # -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + # -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + # -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ + # -auto-approve -no-color + # echo "pass" > /workspace/rag_tf_result.txt + + # # Validate Ray: Make sure pods are running + # kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s + # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 & + # # Wait port-forwarding to take its place + # sleep 5s + + # # Validate Ray: Check dashboard + # ray job submit --working-dir ./tests \ + # --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" + # echo "pass" > /workspace/rag_ray_dashboard_result.txt + + # # Validate JupyterHub: Get hub url + # kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID + # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 & + # # Wait port-forwarding to take its place + # sleep 5s + + # # Validate JupyterHub: Test Hub + # cd /workspace/modules/jupyter/tests + # python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER + # echo "pass" > /workspace/rag_jupyterhub_test_result.txt + + # # Validate RAG: Test rag frontend + # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 & + # # Wait port-forwarding to take its place + # sleep 5s + + # cd /workspace/applications/rag/tests + # python3 test_frontend.py "127.0.0.1:8081" + # echo "pass" > /workspace/rag_frontend_result.txt + + # cd /workspace/ + # sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb + # sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb + # gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/ + # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py + # # Wait for jupyterhub to trigger notebook pod startup + # sleep 5s + # kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s + # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb + # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py + + # python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt" + # echo "pass" > /workspace/rag_prompt_result.txt + + # allowFailure: true + # waitFor: ['create gke cluster'] + + # - id: 'cleanup rag' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'bash' + # args: + # - '-c' + # - | + # set -e + + # cd /workspace/applications/rag/ + # terraform destroy \ + # -var-file=workloads.tfvars \ + # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + # -var=create_cluster=false \ + # -var=jupyter_add_auth=false \ + # -var=frontend_add_auth=false \ + # -var=project_id=$PROJECT_ID \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=cluster_location=$_REGION \ + # -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ + # -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ + # -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ + # -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \ + # -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ + # -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ + # -auto-approve -no-color + # allowFailure: true + # waitFor: ['test rag'] + + # - id: 'cleanup gke cluster' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'bash' + # args: + # - '-c' + # - | + # set -e + + # cd /workspace/infrastructure + # terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \ + # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + # -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + # -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ + # -var=cluster_location=$_REGION -auto-approve -no-color + + # allowFailure: true + # waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster'] + + # - id: 'check result' + # name: 'gcr.io/$PROJECT_ID/terraform' + # entrypoint: 'bash' + # args: + # - '-c' + # - | + # if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then + # echo "gke cluster creation failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then + # echo "ray API run failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then + # echo "ray cluster failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then + # echo "jupyterhub tf failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then + # echo "jupyterhub test failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then + # echo "rag tf failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then + # echo "rag ray dashboard test failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then + # echo "rag jupyterhub test failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then + # echo "rag frontend test failed" + # exit 1 + # fi + + # if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then + # echo "rag prompt test failed" + # exit 1 + # fi + # waitFor: ['cleanup gke cluster'] + substitutions: _REGION: us-east4 _USER_NAME: github @@ -390,13 +390,13 @@ substitutions: _BUILD_ID: ${BUILD_ID:0:8} logsBucket: gs://ai-on-gke-build-logs options: - substitutionOption: 'ALLOW_LOOSE' - machineType: 'E2_HIGHCPU_8' + substitutionOption: "ALLOW_LOOSE" + machineType: "E2_HIGHCPU_8" timeout: 5400s availableSecrets: secretManager: - - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest - env: 'KAGGLE_USERNAME' - - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest - env: 'KAGGLE_KEY' + - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest + env: "KAGGLE_USERNAME" + - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest + env: "KAGGLE_KEY"