From d19c994ac6bb7f4d30bbbc674e5700e26d5d65e1 Mon Sep 17 00:00:00 2001 From: Gen Lu Date: Wed, 18 Sep 2024 23:37:51 +0000 Subject: [PATCH] test Change-Id: Ie80a37694fe35e98e801b0c5a425f7793837b092 --- applications/jupyter/main.tf | 9 +- applications/jupyter/workloads.tfvars | 15 +- .../rag-kaggle-ray-sql-interactive.ipynb | 79 +-- cloudbuild.yaml | 509 +++++------------- 4 files changed, 183 insertions(+), 429 deletions(-) diff --git a/applications/jupyter/main.tf b/applications/jupyter/main.tf index 3998a5255..8469f8dbf 100644 --- a/applications/jupyter/main.tf +++ b/applications/jupyter/main.tf @@ -122,13 +122,6 @@ provider "helm" { } } -module "gcs" { - source = "../../modules/gcs" - count = var.create_gcs_bucket ? 1 : 0 - project_id = var.project_id - bucket_name = var.gcs_bucket -} - # create namespace module "namespace" { source = "../../modules/kubernetes-namespace" @@ -164,5 +157,5 @@ module "jupyterhub" { k8s_backend_service_port = var.k8s_backend_service_port domain = var.domain members_allowlist = var.members_allowlist != "" ? split(",", var.members_allowlist) : [] - depends_on = [module.gcs, module.namespace] + depends_on = [module.namespace] } diff --git a/applications/jupyter/workloads.tfvars b/applications/jupyter/workloads.tfvars index a472bd98c..cb0195232 100644 --- a/applications/jupyter/workloads.tfvars +++ b/applications/jupyter/workloads.tfvars @@ -14,17 +14,17 @@ ##common variables ## Need to pull this variables from tf output from previous infrastructure stage -project_id = "" +project_id = "gke-ai-eco-dev" ## This is required for terraform to connect to GKE cluster and deploy workloads. -cluster_name = "ml-cluster" -cluster_location = "us-central1" +cluster_name = "ml-a236619-827-2ec62154-cluster" +cluster_location = "us-east4" ## If terraform should create a new GKE cluster, fill in this section as well. ## By default, a public autopilot GKE cluster will be created in the default network. ## Set the autopilot_cluster variable to false to create a standard cluster instead. create_cluster = false -autopilot_cluster = true +autopilot_cluster = false cluster_membership_id = "" # required for private cluster, defaults to `cluster_name` ####################################################### @@ -32,9 +32,10 @@ cluster_membership_id = "" # required for private cluster, defaults to `cluster_ ####################################################### ## JupyterHub variables -kubernetes_namespace = "ai-on-gke" -create_gcs_bucket = true -gcs_bucket = "gcs-bucket-" # Choose a globally unique bucket name. +kubernetes_namespace = "jupyter-test" +create_gcs_bucket = false +gcs_bucket = "gke-aieco-rag-a236619-2ec62154" +# Choose a globally unique bucket name. workload_identity_service_account = "jupyter-sa" # IAP Configs diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb index 2b80e437e..05f99ff45 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb @@ -37,9 +37,9 @@ "id": "c7ff518d-f4d2-481b-b408-2c2507565611", "metadata": {}, "source": [ - "## Creating the Database Connection\n", + "## Download the Data\n", "\n", - "Let's now set up a connection to your CloudSQL database:" + "Let's now import required modules:" ] }, { @@ -60,42 +60,7 @@ "from datasets import load_dataset_builder, load_dataset, Dataset\n", "from huggingface_hub import snapshot_download\n", "from google.cloud.sql.connector import Connector, IPTypes\n", - "import sqlalchemy\n", - "\n", - "# initialize parameters\n", - "\n", - "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", - "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", - "DB_NAME = \"pgvector-database\"\n", - "\n", - "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", - "DB_USER = db_username_file.read()\n", - "db_username_file.close()\n", - "\n", - "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", - "DB_PASS = db_password_file.read()\n", - "db_password_file.close()\n", - "\n", - "# initialize Connector object\n", - "connector = Connector()\n", - "\n", - "# function to return the database connection object\n", - "def getconn():\n", - " conn = connector.connect(\n", - " INSTANCE_CONNECTION_NAME,\n", - " \"pg8000\",\n", - " user=DB_USER,\n", - " password=DB_PASS,\n", - " db=DB_NAME,\n", - " ip_type=IPTypes.PRIVATE\n", - " )\n", - " return conn\n", - "\n", - "# create connection pool with 'creator' argument to our connection object function\n", - "pool = sqlalchemy.create_engine(\n", - " \"postgresql+pg8000://\",\n", - " creator=getconn,\n", - ")" + "import sqlalchemy" ] }, { @@ -150,7 +115,9 @@ "os.makedirs(SHARED_DATA_BASEPATH, exist_ok=True)\n", "\n", "# One time download of the sentence transformer model to a shared persistent storage available to the ray workers\n", - "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=SHARED_DATA_BASEPATH)" + "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=\"~/data/rag\")\n", + "\n", + "!cp -r \"~/data/rag\" \"/data/rag/st\"" ] }, { @@ -322,6 +289,40 @@ "from sqlalchemy.orm import scoped_session, sessionmaker, mapped_column\n", "from pgvector.sqlalchemy import Vector\n", "\n", + "# initialize parameters\n", + "\n", + "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", + "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", + "DB_NAME = \"pgvector-database\"\n", + "\n", + "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", + "DB_USER = db_username_file.read()\n", + "db_username_file.close()\n", + "\n", + "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", + "DB_PASS = db_password_file.read()\n", + "db_password_file.close()\n", + "\n", + "# initialize Connector object\n", + "connector = Connector()\n", + "\n", + "# function to return the database connection object\n", + "def getconn():\n", + " conn = connector.connect(\n", + " INSTANCE_CONNECTION_NAME,\n", + " \"pg8000\",\n", + " user=DB_USER,\n", + " password=DB_PASS,\n", + " db=DB_NAME,\n", + " ip_type=IPTypes.PRIVATE\n", + " )\n", + " return conn\n", + "\n", + "# create connection pool with 'creator' argument to our connection object function\n", + "pool = sqlalchemy.create_engine(\n", + " \"postgresql+pg8000://\",\n", + " creator=getconn,\n", + ")\n", "\n", "Base = declarative_base()\n", "DBSession = scoped_session(sessionmaker())\n", diff --git a/cloudbuild.yaml b/cloudbuild.yaml index d5ea41498..79f554d65 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -13,375 +13,134 @@ # limitations under the License. steps: - - id: "validate platform" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "infrastructure/" - waitFor: ["-"] - - - id: "validate ray" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "applications/ray/" - waitFor: ["validate platform"] - - - id: "validate jupyterhub" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "applications/jupyter/" - waitFor: ["validate platform"] - - - id: "validate rag" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "applications/rag/" - waitFor: ["validate platform"] - - # Create cluster to test ray, jupyterhub, rag - # - id: 'create gke cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # env: - # - "KUBE_LOAD_CONFIG_FILE=false" - # entrypoint: 'sh' - # args: - # - '-c' - # - | - # set -e - - # terraform apply \ - # -var-file=tfvars_tests/standard-gke-public.platform.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=subnetwork_region=$_REGION \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ - # -var=cluster_location=$_REGION \ - # -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \ - # -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \ - # -auto-approve -no-color - # echo "pass" > /workspace/gke_cluster_result.txt - # dir: 'infrastructure/' - # allowFailure: true - # waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag] - - # - id: 'test ray cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'sh' - # args: - # - '-c' - # - | - # set -e - - # # Get kube config - # gcloud container clusters get-credentials \ - # ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # --location $_REGION \ - # --project $PROJECT_ID - - # cd /workspace/applications/ray/ - # terraform apply \ - # -var-file=workloads.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ - # -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ - # -var=enable_gpu=true \ - # -auto-approve -no-color - # echo "pass" > /workspace/user_result.txt - - # chmod +x /workspace/scripts/ci/wait_for_pods.sh - # /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000 - - # kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s - # # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable. - # sleep 60s - # kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 & - # # Wait port-forwarding to take its place - # sleep 10s - - # ray job submit \ - # --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" - # echo "pass" > /workspace/ray_result.txt - # allowFailure: true - # waitFor: ['create gke cluster'] - - # - id: 'cleanup ray cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/applications/ray/ - # terraform destroy \ - # -var-file=workloads.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ - # -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ - # -var=enable_gpu=true \ - # -auto-approve -no-color - # allowFailure: true - # waitFor: ['test ray cluster'] - - # - id: 'test jupyterhub' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/modules/jupyter/tests - # python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER - - # cd /workspace/applications/jupyter - # terraform apply \ - # -var-file=workloads-without-iap.example.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ - # -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # echo "pass" > /workspace/jupyterhub_tf_result.txt - - # kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s - # kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter - # kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 & - # # Wait port-forwarding to take its place - # sleep 5s - - # cd /workspace/modules/jupyter/tests - # python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER - # echo "pass" > /workspace/jupyterhub_test_result.txt - # allowFailure: true - # waitFor: ['create gke cluster'] - - # - id: 'cleanup jupyterhub' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/applications/jupyter/ - # terraform destroy \ - # -var-file=workloads-without-iap.example.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ - # -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # allowFailure: true - # waitFor: ['test jupyterhub'] - - # - id: 'test rag' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'sh' - # secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY'] - # args: - # - '-c' - # - | - # set -e - - # # Get kube config - # gcloud container clusters get-credentials \ - # ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # --location $_REGION \ - # --project $PROJECT_ID - - # cd /workspace/modules/jupyter/tests - # python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER - - # cd /workspace/applications/rag/ - # terraform apply \ - # -var-file=workloads.tfvars \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=create_cluster=false \ - # -var=jupyter_add_auth=false \ - # -var=frontend_add_auth=false \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # echo "pass" > /workspace/rag_tf_result.txt - - # # Validate Ray: Make sure pods are running - # kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s - # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 & - # # Wait port-forwarding to take its place - # sleep 5s - - # # Validate Ray: Check dashboard - # ray job submit --working-dir ./tests \ - # --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" - # echo "pass" > /workspace/rag_ray_dashboard_result.txt - - # # Validate JupyterHub: Get hub url - # kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID - # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 & - # # Wait port-forwarding to take its place - # sleep 5s - - # # Validate JupyterHub: Test Hub - # cd /workspace/modules/jupyter/tests - # python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER - # echo "pass" > /workspace/rag_jupyterhub_test_result.txt - - # # Validate RAG: Test rag frontend - # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 & - # # Wait port-forwarding to take its place - # sleep 5s - - # cd /workspace/applications/rag/tests - # python3 test_frontend.py "127.0.0.1:8081" - # echo "pass" > /workspace/rag_frontend_result.txt - - # cd /workspace/ - # sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb - # sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb - # gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/ - # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py - # # Wait for jupyterhub to trigger notebook pod startup - # sleep 5s - # kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s - # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb - # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py - - # python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt" - # echo "pass" > /workspace/rag_prompt_result.txt - - # allowFailure: true - # waitFor: ['create gke cluster'] - - # - id: 'cleanup rag' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/applications/rag/ - # terraform destroy \ - # -var-file=workloads.tfvars \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=create_cluster=false \ - # -var=jupyter_add_auth=false \ - # -var=frontend_add_auth=false \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # allowFailure: true - # waitFor: ['test rag'] - - # - id: 'cleanup gke cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/infrastructure - # terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ - # -var=cluster_location=$_REGION -auto-approve -no-color - - # allowFailure: true - # waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster'] - - # - id: 'check result' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then - # echo "gke cluster creation failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then - # echo "ray API run failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then - # echo "ray cluster failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then - # echo "jupyterhub tf failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then - # echo "jupyterhub test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then - # echo "rag tf failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then - # echo "rag ray dashboard test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then - # echo "rag jupyterhub test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then - # echo "rag frontend test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then - # echo "rag prompt test failed" - # exit 1 - # fi - # waitFor: ['cleanup gke cluster'] +- id: "validate platform" + name: "gcr.io/$PROJECT_ID/terraform" + script: | + terraform init -no-color + terraform validate -no-color + dir: "infrastructure/" + waitFor: ["-"] + +- id: "validate rag" + name: "gcr.io/$PROJECT_ID/terraform" + script: | + terraform init -no-color + terraform validate -no-color + dir: "applications/rag/" + waitFor: ["validate platform"] + +# Create cluster to test ray, jupyterhub, rag +- id: 'create gke cluster' + name: 'gcr.io/$PROJECT_ID/terraform' + env: + - "KUBE_LOAD_CONFIG_FILE=false" + entrypoint: 'sh' + args: + - '-c' + - | + set -e + + terraform apply \ + -var-file=tfvars_tests/standard-gke-public.platform.tfvars \ + -var=project_id=$PROJECT_ID \ + -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + -var=subnetwork_region=$_REGION \ + -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ + -var=cluster_location=$_REGION \ + -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \ + -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \ + -auto-approve -no-color + echo "pass" > /workspace/gke_cluster_result.txt + dir: 'infrastructure/' + allowFailure: true + waitFor: ['validate platform', 'validate rag'] + +- id: 'test rag' + name: 'gcr.io/$PROJECT_ID/terraform' + entrypoint: 'sh' + secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY'] + args: + - '-c' + - | + set -e + + # Get kube config + gcloud container clusters get-credentials \ + ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + --location $_REGION \ + --project $PROJECT_ID + + cd /workspace/modules/jupyter/tests + python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER + + cd /workspace/applications/rag/ + terraform apply \ + -var-file=workloads.tfvars \ + -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + -var=create_cluster=false \ + -var=jupyter_add_auth=false \ + -var=frontend_add_auth=false \ + -var=project_id=$PROJECT_ID \ + -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + -var=cluster_location=$_REGION \ + -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ + -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ + -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ + -auto-approve -no-color + echo "pass" > /workspace/rag_tf_result.txt + + # Validate Ray: Make sure pods are running + kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s + kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 & + # Wait port-forwarding to take its place + sleep 5s + + # Validate Ray: Check dashboard + ray job submit --working-dir ./tests \ + --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" + echo "pass" > /workspace/rag_ray_dashboard_result.txt + + # Validate JupyterHub: Get hub url + kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID + kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 & + # Wait port-forwarding to take its place + sleep 5s + + # Validate JupyterHub: Test Hub + cd /workspace/modules/jupyter/tests + python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER + echo "pass" > /workspace/rag_jupyterhub_test_result.txt + + # Validate RAG: Test rag frontend + kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 & + # Wait port-forwarding to take its place + sleep 5s + + cd /workspace/applications/rag/tests + python3 test_frontend.py "127.0.0.1:8081" + echo "pass" > /workspace/rag_frontend_result.txt + + cd /workspace/ + sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb + sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb + gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/ + kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py + # Wait for jupyterhub to trigger notebook pod startup + sleep 5s + kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s + kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb + kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py + + python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt" + echo "pass" > /workspace/rag_prompt_result.txt + + allowFailure: true + waitFor: ['create gke cluster'] substitutions: _REGION: us-east4 @@ -393,9 +152,9 @@ options: substitutionOption: "ALLOW_LOOSE" machineType: "E2_HIGHCPU_8" timeout: 5400s -# availableSecrets: -# secretManager: -# - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest -# env: "KAGGLE_USERNAME" -# - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest -# env: "KAGGLE_KEY" +availableSecrets: + secretManager: + - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest + env: "KAGGLE_USERNAME" + - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest + env: "KAGGLE_KEY"