From d19c994ac6bb7f4d30bbbc674e5700e26d5d65e1 Mon Sep 17 00:00:00 2001
From: Gen Lu <genlu@google.com>
Date: Wed, 18 Sep 2024 23:37:51 +0000
Subject: [PATCH] test

Change-Id: Ie80a37694fe35e98e801b0c5a425f7793837b092
---
 applications/jupyter/main.tf                  |   9 +-
 applications/jupyter/workloads.tfvars         |  15 +-
 .../rag-kaggle-ray-sql-interactive.ipynb      |  79 +--
 cloudbuild.yaml                               | 509 +++++-------------
 4 files changed, 183 insertions(+), 429 deletions(-)
diff --git a/applications/jupyter/main.tf b/applications/jupyter/main.tf
index 3998a5255..8469f8dbf 100644
--- a/applications/jupyter/main.tf
+++ b/applications/jupyter/main.tf
@@ -122,13 +122,6 @@ provider "helm" {
   }
 }
 
-module "gcs" {
-  source      = "../../modules/gcs"
-  count       = var.create_gcs_bucket ? 1 : 0
-  project_id  = var.project_id
-  bucket_name = var.gcs_bucket
-}
-
 # create namespace
 module "namespace" {
   source           = "../../modules/kubernetes-namespace"
@@ -164,5 +157,5 @@ module "jupyterhub" {
   k8s_backend_service_port = var.k8s_backend_service_port
   domain                   = var.domain
   members_allowlist        = var.members_allowlist != "" ? split(",", var.members_allowlist) : []
-  depends_on               = [module.gcs, module.namespace]
+  depends_on               = [module.namespace]
 }
diff --git a/applications/jupyter/workloads.tfvars b/applications/jupyter/workloads.tfvars
index a472bd98c..cb0195232 100644
--- a/applications/jupyter/workloads.tfvars
+++ b/applications/jupyter/workloads.tfvars
@@ -14,17 +14,17 @@
 
 ##common variables
 ## Need to pull this variables from tf output from previous infrastructure stage
-project_id = "<your project ID>"
+project_id = "gke-ai-eco-dev"
 
 ## This is required for terraform to connect to GKE cluster and deploy workloads.
-cluster_name     = "ml-cluster"
-cluster_location = "us-central1"
+cluster_name     = "ml-a236619-827-2ec62154-cluster"
+cluster_location = "us-east4"
 
 ## If terraform should create a new GKE cluster, fill in this section as well.
 ##    By default, a public autopilot GKE cluster will be created in the default network.
 ##    Set the autopilot_cluster variable to false to create a standard cluster instead.
 create_cluster        = false
-autopilot_cluster     = true
+autopilot_cluster     = false
 cluster_membership_id = "" # required for private cluster, defaults to `cluster_name`
 
 #######################################################
@@ -32,9 +32,10 @@ cluster_membership_id = "" # required for private cluster, defaults to `cluster_
 #######################################################
 
 ## JupyterHub variables
-kubernetes_namespace              = "ai-on-gke"
-create_gcs_bucket                 = true
-gcs_bucket                        = "gcs-bucket-<unique-suffix>" # Choose a globally unique bucket name.
+kubernetes_namespace = "jupyter-test"
+create_gcs_bucket    = false
+gcs_bucket           = "gke-aieco-rag-a236619-2ec62154"
+# gcs_bucket (above): choose a globally unique bucket name.
 workload_identity_service_account = "jupyter-sa"
 
 # IAP Configs
diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
index 2b80e437e..05f99ff45 100644
--- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
+++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
@@ -37,9 +37,9 @@
    "id": "c7ff518d-f4d2-481b-b408-2c2507565611",
    "metadata": {},
    "source": [
-    "## Creating the Database Connection\n",
+    "## Download the Data\n",
     "\n",
-    "Let's now set up a connection to your CloudSQL database:"
+    "Let's now import required modules:"
    ]
   },
   {
@@ -60,42 +60,7 @@
     "from datasets import load_dataset_builder, load_dataset, Dataset\n",
     "from huggingface_hub import snapshot_download\n",
     "from google.cloud.sql.connector import Connector, IPTypes\n",
-    "import sqlalchemy\n",
-    "\n",
-    "# initialize parameters\n",
-    "\n",
-    "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n",
-    "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n",
-    "DB_NAME = \"pgvector-database\"\n",
-    "\n",
-    "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n",
-    "DB_USER = db_username_file.read()\n",
-    "db_username_file.close()\n",
-    "\n",
-    "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n",
-    "DB_PASS = db_password_file.read()\n",
-    "db_password_file.close()\n",
-    "\n",
-    "# initialize Connector object\n",
-    "connector = Connector()\n",
-    "\n",
-    "# function to return the database connection object\n",
-    "def getconn():\n",
-    "    conn = connector.connect(\n",
-    "        INSTANCE_CONNECTION_NAME,\n",
-    "        \"pg8000\",\n",
-    "        user=DB_USER,\n",
-    "        password=DB_PASS,\n",
-    "        db=DB_NAME,\n",
-    "        ip_type=IPTypes.PRIVATE\n",
-    "    )\n",
-    "    return conn\n",
-    "\n",
-    "# create connection pool with 'creator' argument to our connection object function\n",
-    "pool = sqlalchemy.create_engine(\n",
-    "    \"postgresql+pg8000://\",\n",
-    "    creator=getconn,\n",
-    ")"
+    "import sqlalchemy"
    ]
   },
   {
@@ -150,7 +115,9 @@
     "os.makedirs(SHARED_DATA_BASEPATH, exist_ok=True)\n",
     "\n",
     "# One time download of the sentence transformer model to a shared persistent storage available to the ray workers\n",
-    "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=SHARED_DATA_BASEPATH)"
+    "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=\"~/data/rag\")\n",
+    "\n",
+    "!cp -r \"~/data/rag\" \"/data/rag/st\""
    ]
   },
   {
@@ -322,6 +289,40 @@
     "from sqlalchemy.orm import scoped_session, sessionmaker, mapped_column\n",
     "from pgvector.sqlalchemy import Vector\n",
     "\n",
+    "# initialize parameters\n",
+    "\n",
+    "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n",
+    "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n",
+    "DB_NAME = \"pgvector-database\"\n",
+    "\n",
+    "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n",
+    "DB_USER = db_username_file.read()\n",
+    "db_username_file.close()\n",
+    "\n",
+    "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n",
+    "DB_PASS = db_password_file.read()\n",
+    "db_password_file.close()\n",
+    "\n",
+    "# initialize Connector object\n",
+    "connector = Connector()\n",
+    "\n",
+    "# function to return the database connection object\n",
+    "def getconn():\n",
+    "    conn = connector.connect(\n",
+    "        INSTANCE_CONNECTION_NAME,\n",
+    "        \"pg8000\",\n",
+    "        user=DB_USER,\n",
+    "        password=DB_PASS,\n",
+    "        db=DB_NAME,\n",
+    "        ip_type=IPTypes.PRIVATE\n",
+    "    )\n",
+    "    return conn\n",
+    "\n",
+    "# create connection pool with 'creator' argument to our connection object function\n",
+    "pool = sqlalchemy.create_engine(\n",
+    "    \"postgresql+pg8000://\",\n",
+    "    creator=getconn,\n",
+    ")\n",
     "\n",
     "Base = declarative_base()\n",
     "DBSession = scoped_session(sessionmaker())\n",
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index d5ea41498..79f554d65 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -13,375 +13,134 @@
 # limitations under the License.
 
 steps:
-  - id: "validate platform"
-    name: "gcr.io/$PROJECT_ID/terraform"
-    script: |
-      terraform init -no-color
-      terraform validate -no-color
-    dir: "infrastructure/"
-    waitFor: ["-"]
-
-  - id: "validate ray"
-    name: "gcr.io/$PROJECT_ID/terraform"
-    script: |
-      terraform init -no-color
-      terraform validate -no-color
-    dir: "applications/ray/"
-    waitFor: ["validate platform"]
-
-  - id: "validate jupyterhub"
-    name: "gcr.io/$PROJECT_ID/terraform"
-    script: |
-      terraform init -no-color
-      terraform validate -no-color
-    dir: "applications/jupyter/"
-    waitFor: ["validate platform"]
-
-  - id: "validate rag"
-    name: "gcr.io/$PROJECT_ID/terraform"
-    script: |
-      terraform init -no-color
-      terraform validate -no-color
-    dir: "applications/rag/"
-    waitFor: ["validate platform"]
-
-  # Create cluster to test ray, jupyterhub, rag
-  # - id: 'create gke cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   env:
-  #     - "KUBE_LOAD_CONFIG_FILE=false"
-  #   entrypoint: 'sh'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       terraform apply \
-  #       -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
-  #       -var=project_id=$PROJECT_ID \
-  #       -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
-  #       -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
-  #       -var=subnetwork_region=$_REGION \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-  #       -var=cluster_location=$_REGION \
-  #       -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \
-  #       -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \
-  #       -auto-approve -no-color
-  #       echo "pass" > /workspace/gke_cluster_result.txt
-  #   dir: 'infrastructure/'
-  #   allowFailure: true
-  #   waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag]
-
-  # - id: 'test ray cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'sh'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       # Get kube config
-  #       gcloud container clusters get-credentials \
-  #       ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       --location $_REGION \
-  #       --project $PROJECT_ID
-
-  #       cd /workspace/applications/ray/
-  #       terraform apply \
-  #       -var-file=workloads.tfvars \
-  #       -var=project_id=$PROJECT_ID \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=cluster_location=$_REGION \
-  #       -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \
-  #       -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-  #       -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-  #       -var=enable_gpu=true \
-  #       -auto-approve -no-color
-  #       echo "pass" > /workspace/user_result.txt
-
-  #       chmod +x /workspace/scripts/ci/wait_for_pods.sh
-  #       /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000
-
-  #       kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s
-  #       # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable.
-  #       sleep 60s
-  #       kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 10s
-
-  #       ray job submit \
-  #       --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
-  #       echo "pass" > /workspace/ray_result.txt
-  #   allowFailure: true
-  #   waitFor: ['create gke cluster']
-
-  # - id: 'cleanup ray cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/applications/ray/
-  #       terraform destroy \
-  #       -var-file=workloads.tfvars \
-  #       -var=project_id=$PROJECT_ID \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=cluster_location=$_REGION \
-  #       -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \
-  #       -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-  #       -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-  #       -var=enable_gpu=true \
-  #       -auto-approve -no-color
-  #   allowFailure: true
-  #   waitFor: ['test ray cluster']
-
-  # - id: 'test jupyterhub'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
-
-  #       cd /workspace/applications/jupyter
-  #       terraform apply \
-  #       -var-file=workloads-without-iap.example.tfvars \
-  #       -var=project_id=$PROJECT_ID \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=cluster_location=$_REGION \
-  #       -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
-  #       -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-  #       -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-  #       -auto-approve -no-color
-  #       echo "pass" > /workspace/jupyterhub_tf_result.txt
-
-  #       kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s
-  #       kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter
-  #       kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER
-  #       echo "pass" > /workspace/jupyterhub_test_result.txt
-  #   allowFailure: true
-  #   waitFor: ['create gke cluster']
-
-  # - id: 'cleanup jupyterhub'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/applications/jupyter/
-  #       terraform destroy \
-  #       -var-file=workloads-without-iap.example.tfvars \
-  #       -var=project_id=$PROJECT_ID \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=cluster_location=$_REGION \
-  #       -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
-  #       -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-  #       -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-  #       -auto-approve -no-color
-  #   allowFailure: true
-  #   waitFor: ['test jupyterhub']
-
-  # - id: 'test rag'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'sh'
-  #   secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY']
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       # Get kube config
-  #       gcloud container clusters get-credentials \
-  #       ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       --location $_REGION \
-  #       --project $PROJECT_ID
-
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
-
-  #       cd /workspace/applications/rag/
-  #       terraform apply \
-  #       -var-file=workloads.tfvars \
-  #       -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
-  #       -var=create_cluster=false \
-  #       -var=jupyter_add_auth=false \
-  #       -var=frontend_add_auth=false \
-  #       -var=project_id=$PROJECT_ID \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=cluster_location=$_REGION \
-  #       -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-  #       -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-  #       -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-  #       -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-  #       -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-  #       -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-  #       -auto-approve -no-color
-  #       echo "pass" > /workspace/rag_tf_result.txt
-
-  #       # Validate Ray: Make sure pods are running
-  #       kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s
-  #       kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       # Validate Ray: Check dashboard
-  #       ray job submit --working-dir ./tests \
-  #       --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
-  #       echo "pass" > /workspace/rag_ray_dashboard_result.txt
-
-  #       # Validate JupyterHub: Get hub url
-  #       kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
-  #       kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       # Validate JupyterHub: Test Hub
-  #       cd /workspace/modules/jupyter/tests
-  #       python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER
-  #       echo "pass" > /workspace/rag_jupyterhub_test_result.txt
-
-  #       # Validate RAG: Test rag frontend
-  #       kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &
-  #       # Wait port-forwarding to take its place
-  #       sleep 5s
-
-  #       cd /workspace/applications/rag/tests
-  #       python3 test_frontend.py "127.0.0.1:8081"
-  #       echo "pass" > /workspace/rag_frontend_result.txt
-
-  #       cd /workspace/
-  #       sed -i "s/<username>/$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
-  #       sed -i "s/<token>/$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
-  #       gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/
-  #       kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py
-  #       # Wait for jupyterhub to trigger notebook pod startup
-  #       sleep 5s
-  #       kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s
-  #       kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb
-  #       kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py
-
-  #       python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt"
-  #       echo "pass" > /workspace/rag_prompt_result.txt
-
-  #   allowFailure: true
-  #   waitFor: ['create gke cluster']
-
-  # - id: 'cleanup rag'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/applications/rag/
-  #       terraform destroy \
-  #       -var-file=workloads.tfvars \
-  #       -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
-  #       -var=create_cluster=false \
-  #       -var=jupyter_add_auth=false \
-  #       -var=frontend_add_auth=false \
-  #       -var=project_id=$PROJECT_ID \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=cluster_location=$_REGION \
-  #       -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-  #       -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-  #       -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-  #       -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
-  #       -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-  #       -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-  #       -auto-approve -no-color
-  #   allowFailure: true
-  #   waitFor: ['test rag']
-
-  # - id: 'cleanup gke cluster'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       set -e
-
-  #       cd /workspace/infrastructure
-  #       terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-  #       -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-  #       -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
-  #       -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
-  #       -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-  #       -var=cluster_location=$_REGION -auto-approve -no-color
-
-  #   allowFailure: true
-  #   waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster']
-
-  # - id: 'check result'
-  #   name: 'gcr.io/$PROJECT_ID/terraform'
-  #   entrypoint: 'bash'
-  #   args:
-  #     - '-c'
-  #     - |
-  #       if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then
-  #         echo "gke cluster creation failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then
-  #         echo "ray API run failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then
-  #         echo "ray cluster failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then
-  #         echo "jupyterhub tf failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then
-  #         echo "jupyterhub test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then
-  #         echo "rag tf failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then
-  #         echo "rag ray dashboard test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then
-  #         echo "rag jupyterhub test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then
-  #         echo "rag frontend test failed"
-  #         exit 1
-  #       fi
-
-  #       if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then
-  #         echo "rag prompt test failed"
-  #         exit 1
-  #       fi
-  #   waitFor: ['cleanup gke cluster']
+- id: "validate platform"
+  name: "gcr.io/$PROJECT_ID/terraform"
+  script: |
+    terraform init -no-color
+    terraform validate -no-color
+  dir: "infrastructure/"
+  waitFor: ["-"]
+
+- id: "validate rag"
+  name: "gcr.io/$PROJECT_ID/terraform"
+  script: |
+    terraform init -no-color
+    terraform validate -no-color
+  dir: "applications/rag/"
+  waitFor: ["validate platform"]
+
+# Create cluster to test ray, jupyterhub, rag
+- id: 'create gke cluster'
+  name: 'gcr.io/$PROJECT_ID/terraform'
+  env:
+  - "KUBE_LOAD_CONFIG_FILE=false"
+  entrypoint: 'sh'
+  args:
+  - '-c'
+  - |
+    set -e
+
+    terraform apply \
+    -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
+    -var=project_id=$PROJECT_ID \
+    -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
+    -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
+    -var=subnetwork_region=$_REGION \
+    -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+    -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
+    -var=cluster_location=$_REGION \
+    -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \
+    -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \
+    -auto-approve -no-color
+    echo "pass" > /workspace/gke_cluster_result.txt
+  dir: 'infrastructure/'
+  allowFailure: true
+  waitFor: ['validate platform', 'validate rag']
+
+- id: 'test rag'
+  name: 'gcr.io/$PROJECT_ID/terraform'
+  entrypoint: 'sh'
+  secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY']
+  args:
+  - '-c'
+  - |
+    set -e
+
+    # Get kube config
+    gcloud container clusters get-credentials \
+    ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+    --location $_REGION \
+    --project $PROJECT_ID
+
+    cd /workspace/modules/jupyter/tests
+    python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
+
+    cd /workspace/applications/rag/
+    terraform apply \
+    -var-file=workloads.tfvars \
+    -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER  \
+    -var=create_cluster=false \
+    -var=jupyter_add_auth=false \
+    -var=frontend_add_auth=false \
+    -var=project_id=$PROJECT_ID \
+    -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+    -var=cluster_location=$_REGION \
+    -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
+    -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
+    -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+    -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+    -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
+    -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
+    -auto-approve -no-color
+    echo "pass" > /workspace/rag_tf_result.txt
+
+    # Validate Ray: Make sure pods are running
+    kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s
+    kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 &
+    # Wait for port-forwarding to be established
+    sleep 5s
+
+    # Validate Ray: Check dashboard
+    ray job submit --working-dir ./tests \
+    --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
+    echo "pass" > /workspace/rag_ray_dashboard_result.txt
+
+    # Validate JupyterHub: Get hub url
+    kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
+    kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 &
+    # Wait for port-forwarding to be established
+    sleep 5s
+
+    # Validate JupyterHub: Test Hub
+    cd /workspace/modules/jupyter/tests
+    python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER
+    echo "pass" > /workspace/rag_jupyterhub_test_result.txt
+
+    # Validate RAG: Test rag frontend
+    kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &
+    # Wait for port-forwarding to be established
+    sleep 5s
+
+    cd /workspace/applications/rag/tests
+    python3 test_frontend.py "127.0.0.1:8081"
+    echo "pass" > /workspace/rag_frontend_result.txt
+
+    cd /workspace/
+    sed -i "s/<username>/$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
+    sed -i "s/<token>/$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
+    gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/
+    kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py
+    # Wait for jupyterhub to trigger notebook pod startup
+    sleep 5s
+    kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s
+    kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb
+    kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py
+
+    python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt"
+    echo "pass" > /workspace/rag_prompt_result.txt
+
+  allowFailure: true
+  waitFor: ['create gke cluster']
 
 substitutions:
   _REGION: us-east4
@@ -393,9 +152,9 @@ options:
   substitutionOption: "ALLOW_LOOSE"
   machineType: "E2_HIGHCPU_8"
 timeout: 5400s
-# availableSecrets:
-#   secretManager:
-#     - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest
-#       env: "KAGGLE_USERNAME"
-#     - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest
-#       env: "KAGGLE_KEY"
+availableSecrets:
+  secretManager:
+  - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest
+    env: "KAGGLE_USERNAME"
+  - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest
+    env: "KAGGLE_KEY"