Add cloudbuild tests for rag application (#261)
* test rag application setup

* update rag tests

* fix check result step

* fix as per review comments

* fix variables for applications/jupyter

* fix bugs for jupyter & rag app

* CI fixes & revert jupyterhub module changes

* correcting merge conflict miss

* fix system account SA names

* add SHA suffix to cloudsql instance
hsachdevah authored Mar 6, 2024
1 parent 620167f commit 39db886
Showing 7 changed files with 234 additions and 21 deletions.
8 changes: 4 additions & 4 deletions applications/jupyter/workloads-without-iap.example.tfvars
@@ -26,10 +26,10 @@ cluster_membership_id = "" # required only for private clusters, default: cluste
#######################################################

## JupyterHub variables
namespace = "jupyter"
gcs_bucket = "<gcs-bucket>"
create_gcs_bucket = true
workload_identity_service_account = "jupyter-service-account"
namespace = "jupyter"
gcs_bucket = "<gcs-bucket>"
create_gcs_bucket = true
workload_identity_service_account = "jupyter-service-account"

# Jupyterhub without IAP
add_auth = false
1 change: 1 addition & 0 deletions applications/rag/main.tf
@@ -200,6 +200,7 @@ module "frontend" {
google_service_account = var.rag_service_account
namespace = var.kubernetes_namespace
inference_service_endpoint = module.inference-server.inference_service_endpoint
cloudsql_instance = var.cloudsql_instance
db_secret_name = module.cloudsql.db_secret_name
db_secret_namespace = module.cloudsql.db_secret_namespace
dataset_embeddings_table_name = var.dataset_embeddings_table_name
11 changes: 11 additions & 0 deletions applications/rag/tests/test_frontend.py
@@ -0,0 +1,11 @@
import sys
import requests

def test_frontend_up(rag_frontend_url):
r = requests.get(rag_frontend_url)
r.raise_for_status()
print("Rag frontend is up.")

hub_url = "http://" + sys.argv[1]

test_frontend_up(hub_url)
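The new test simply issues an HTTP GET against the frontend address passed as the first command-line argument and fails on any non-2xx status. In the cloudbuild step added further down, the rag-frontend Service is port-forwarded to 127.0.0.1:8081 first and the script is run against that address. A minimal standalone sketch of the same check follows; the explicit timeout is an illustrative addition, not part of the committed test:

# Sketch of the frontend health check, assuming the Service has already been
# port-forwarded locally, e.g.:
#   kubectl port-forward -n rag-$SHORT_SHA service/rag-frontend 8081:8080 &
#   python3 test_frontend.py "127.0.0.1:8081"
import requests

url = "http://127.0.0.1:8081"
resp = requests.get(url, timeout=30)   # timeout added here for illustration only
resp.raise_for_status()                # any non-2xx status raises and fails the check
print("Rag frontend is up.")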
2 changes: 1 addition & 1 deletion applications/rag/variables.tf
@@ -209,7 +209,7 @@ variable "jupyter_k8s_backend_service_name" {

variable "jupyter_k8s_backend_service_port" {
type = number
description = "NName of the Backend Service Port"
description = "Name of the Backend Service Port"
default = 80
}

196 changes: 182 additions & 14 deletions cloudbuild.yaml
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -27,16 +27,25 @@ steps:
terraform init -no-color
terraform validate -no-color
dir: 'applications/ray/'
waitFor: ['-']
waitFor: ['validate platform']

- id: 'validate jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
script: |
terraform init -no-color
terraform validate -no-color
dir: 'applications/jupyter/'
waitFor: ['-']
waitFor: ['validate platform']

- id: 'validate rag'
name: 'gcr.io/$PROJECT_ID/terraform'
script: |
terraform init -no-color
terraform validate -no-color
dir: 'applications/rag/'
waitFor: ['validate platform']

# Create cluster to test ray, jupyterhub
- id: 'create gke cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
env:
@@ -46,13 +55,16 @@ steps:
- '-c'
- |
set -e
terraform apply -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
terraform apply \
-var-file=tfvars_tests/standard-gke-public.platform.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_region=$_REGION -auto-approve -no-color
-var=cluster_region=$_REGION \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/gke_cluster_result.txt
dir: 'infrastructure/'
allowFailure: true
waitFor: ['validate platform', 'validate ray', 'validate jupyterhub']
waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', 'validate rag']

- id: 'test ray cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
@@ -68,7 +80,7 @@ steps:
--location $_REGION \
--project $PROJECT_ID
cd applications/ray/
cd /workspace/applications/ray/
terraform apply \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
Expand All @@ -77,7 +89,7 @@ steps:
-var=ray_namespace=ml-$SHORT_SHA \
-var=gcp_service_account=ray-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-auto-approve -no-color
-auto-approve -no-color -lock=false
echo "pass" > /workspace/user_result.txt
# Make sure pods are running
@@ -87,11 +99,33 @@ steps:
sleep 5s
ray job submit --working-dir ./example_ray_job_scripts \
--address=http://127.0.0.1:8265 -- python ray_job.py
--address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
echo "pass" > /workspace/ray_result.txt
allowFailure: true
waitFor: ['create gke cluster']

- id: 'cleanup ray cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/applications/ray/
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_location=$_REGION \
-var=ray_namespace=ml-$SHORT_SHA \
-var=gcp_service_account=ray-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-auto-approve -no-color
allowFailure: true
waitFor: ['test ray cluster']

- id: 'test jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
@@ -111,7 +145,7 @@ steps:
-var=namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
-auto-approve -no-color
-auto-approve -no-color -lock=false
echo "pass" > /workspace/jupyterhub_tf_result.txt
kubectl wait --all pods -n ml-$SHORT_SHA --for=condition=Ready --timeout=300s
@@ -123,9 +157,94 @@ steps:
python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt)
echo "pass" > /workspace/jupyterhub_test_result.txt
allowFailure: true
waitFor: ['test ray cluster']
# waitFor: ['cleanup ray cluster']

- id: 'clean gke cluster'
- id: 'cleanup jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/applications/jupyter/
terraform destroy \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
-auto-approve -no-color
allowFailure: true
waitFor: ['test jupyterhub']

- id: 'test rag'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'sh'
args:
- '-c'
- |
set -e
# Get kube config
gcloud container clusters get-credentials \
ml-$SHORT_SHA-$_PR_NUMBER-cluster \
--location $_REGION \
--project $PROJECT_ID
cd /workspace/modules/jupyter/tests
python3 change_jupyter_config.py
cd /workspace/applications/rag/
terraform apply \
-var-file=workloads.tfvars \
-var=jupyter_add_auth=false \
-var=frontend_add_auth=false \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=kubernetes_namespace=rag-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
-var=ray_service_account=ray-sa-$SHORT_SHA \
-var=rag_service_account=rag-sa-$SHORT_SHA \
-var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/rag_tf_result.txt
# Validate Ray: Make sure pods are running
kubectl wait --all pods -n rag-$SHORT_SHA --for=condition=Ready --timeout=300s
kubectl port-forward -n rag-$SHORT_SHA service/example-cluster-kuberay-head-svc 8265:8265 &
# Wait port-forwarding to take its place
sleep 5s
# Validate Ray: Check dashboard
ray job submit --working-dir ./tests \
--address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
echo "pass" > /workspace/rag_ray_dashboard_result.txt
# Validate Jupyterhub: Get hub url
kubectl get services -n rag-$SHORT_SHA
kubectl get service proxy-public -n rag-$SHORT_SHA --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/rag_jupyterhub_host_url.txt
echo "HOST URL is " $(cat /workspace/rag_jupyterhub_host_url.txt)
# Validate Jupyterhub: Test Hub
cd /workspace/modules/jupyter/tests
python3 test_hub.py $(cat /workspace/rag_jupyterhub_host_url.txt)
echo "pass" > /workspace/rag_jupyterhub_test_result.txt
# Validate RAG: Test rag frontend
kubectl port-forward -n rag-$SHORT_SHA service/rag-frontend 8081:8080 &
# Wait port-forwarding to take its place
sleep 5s
cd /workspace/applications/rag/tests
python3 test_frontend.py "127.0.0.1:8081"
echo "pass" > /workspace/rag_frontend_result.txt
allowFailure: true

- id: 'cleanup rag'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
@@ -154,12 +273,37 @@ steps:
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-auto-approve -no-color
cd /workspace/applications/rag/
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=kubernetes_namespace=rag-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
-var=ray_service_account=ray-sa-$SHORT_SHA \
-var=rag_service_account=rag-sa-$SHORT_SHA \
-var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
-auto-approve -no-color
allowFailure: true
waitFor: ['test rag']

- id: 'cleanup gke cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/infrastructure
terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_region=$_REGION -auto-approve -no-color
allowFailure: true
waitFor: ['test jupyterhub']
waitFor: ['cleanup rag']

- id: 'check result'
name: 'gcr.io/$PROJECT_ID/terraform'
@@ -191,8 +335,32 @@ steps:
echo "jupyterhub test failed"
exit 1
fi
waitFor: ['clean gke cluster']
if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then
echo "rag tf failed"
exit 1
fi
if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then
echo "rag ray dashboard test failed"
exit 1
fi
if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then
echo "rag jupyterhub test failed"
exit 1
fi
if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then
echo "rag frontend test failed"
exit 1
fi
waitFor: ['cleanup gke cluster']

substitutions:
_REGION: us-central1
_USER_NAME: github
options:
substitutionOption: 'ALLOW_LOOSE'
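
A note on how these new steps gate the build: each test step runs with allowFailure: true and records success by writing pass into a file under /workspace (for example rag_tf_result.txt or rag_frontend_result.txt), so the cleanup steps still run after a failure; the final check result step then fails the build if any of those files does not read pass. Below is a rough standalone sketch of that gate, in Python purely for illustration; the build itself uses the inline shell checks shown above, and the file list is taken from the steps in this diff:

# Illustrative re-implementation of the "check result" gate: each test step
# writes "pass" into /workspace/<name>; fail if any file is missing or differs.
import sys
from pathlib import Path

RESULT_FILES = [
    "gke_cluster_result.txt",
    "user_result.txt",
    "ray_result.txt",
    "jupyterhub_tf_result.txt",
    "jupyterhub_test_result.txt",
    "rag_tf_result.txt",
    "rag_ray_dashboard_result.txt",
    "rag_jupyterhub_test_result.txt",
    "rag_frontend_result.txt",
]

def check_results(workspace: str = "/workspace") -> None:
    for name in RESULT_FILES:
        path = Path(workspace) / name
        if not path.is_file() or path.read_text().strip() != "pass":
            print(f"{name} check failed")
            sys.exit(1)
    print("all checks passed")

if __name__ == "__main__":
    check_results()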

33 changes: 33 additions & 0 deletions infrastructure/tfvars_tests/standard-gke-public.platform.tfvars
@@ -49,3 +49,36 @@ cpu_pools = [{
disk_size_gb = 100
disk_type = "pd-standard"
}]

## make sure required gpu quotas are available in the corresponding region
enable_gpu = true
gpu_pools = [{
name = "gpu-pool-t4"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
disk_size_gb = 100
enable_gcfs = true
logging_variant = "DEFAULT"
disk_type = "pd-balanced"
accelerator_count = 2
accelerator_type = "nvidia-tesla-t4"
gpu_driver_version = "LATEST"
},
{
name = "gpu-pool-l4"
machine_type = "g2-standard-24"
node_locations = "us-central1-a"
autoscaling = true
min_count = 2
max_count = 3
accelerator_count = 2
disk_size_gb = 100
enable_gcfs = true
logging_variant = "DEFAULT"
disk_type = "pd-balanced"
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
}]
4 changes: 2 additions & 2 deletions modules/iap/variables.tf
@@ -137,8 +137,8 @@ variable "jupyter_k8s_backend_service_name" {

variable "jupyter_k8s_backend_service_port" {
type = number
description = "NName of the Backend Service Port"
default = 80
description = "Name of the Backend Service Port"
default = 80
}

variable "jupyter_url_domain_addr" {