Skip to content

Commit

Permalink
CI for AutoPilot clusters (#315)
Browse files Browse the repository at this point in the history
* added cloudbuild.yaml to test on AP clusters

* debug jupyter test

* parameterize scripts for autopilot

* refactor cloudbuild file to use cluster_type

* fix jupyter test script args

* single cloudbuild for std & ap clusters

* replace suffix with build_id

* fix namespace_selector for gke-gmp-system
  • Loading branch information
hsachdevah committed Mar 18, 2024
1 parent 5b347a6 commit f8c0453
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 79 deletions.
135 changes: 59 additions & 76 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,19 @@ steps:
- '-c'
- |
set -e
terraform apply \
-var-file=tfvars_tests/standard-gke-public.platform.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-var=cluster_location=$_REGION \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/gke_cluster_result.txt
dir: 'infrastructure/'
allowFailure: true
      waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', 'validate rag']

- id: 'test ray cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'sh'
Expand All @@ -76,25 +78,25 @@ steps:
# Get kube config
gcloud container clusters get-credentials \
ml-$SHORT_SHA-$_PR_NUMBER-cluster \
ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
--location $_REGION \
--project $PROJECT_ID
cd /workspace/applications/ray/
terraform apply \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=ray-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
-var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/user_result.txt
# Make sure pods are running
kubectl wait --all pods -n ml-$SHORT_SHA --for=condition=Ready --timeout=300s
kubectl port-forward -n ml-$SHORT_SHA service/example-cluster-kuberay-head-svc 8265:8265 &
kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/example-cluster-kuberay-head-svc 8265:8265 &
# Wait port-forwarding to take its place
sleep 5s
Expand All @@ -116,12 +118,12 @@ steps:
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=ray-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-auto-approve -no-color
-var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
-var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
allowFailure: true
waitFor: ['test ray cluster']
Expand All @@ -135,29 +137,28 @@ steps:
set -e
cd /workspace/modules/jupyter/tests
python3 change_jupyter_config.py
python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
cd /workspace/applications/jupyter
terraform apply \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=kubernetes_namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/jupyterhub_tf_result.txt
kubectl wait --all pods -n ml-$SHORT_SHA --for=condition=Ready --timeout=300s
kubectl get services -n ml-$SHORT_SHA
kubectl get service proxy-public -n ml-$SHORT_SHA --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/jupyterhub_host_url.txt
kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID
kubectl get service proxy-public -n ml-$SHORT_SHA-$_BUILD_ID --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/jupyterhub_host_url.txt
echo "HOST URL is " $(cat /workspace/jupyterhub_host_url.txt)
cd /workspace/modules/jupyter/tests
python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt)
python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt) $_AUTOPILOT_CLUSTER
echo "pass" > /workspace/jupyterhub_test_result.txt
allowFailure: true
# waitFor: ['cleanup ray cluster']

- id: 'cleanup jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
Expand All @@ -171,11 +172,11 @@ steps:
terraform destroy \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=kubernetes_namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
-auto-approve -no-color
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
allowFailure: true
waitFor: ['test jupyterhub']
Expand All @@ -190,32 +191,32 @@ steps:
# Get kube config
gcloud container clusters get-credentials \
ml-$SHORT_SHA-$_PR_NUMBER-cluster \
ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
--location $_REGION \
--project $PROJECT_ID
cd /workspace/modules/jupyter/tests
python3 change_jupyter_config.py
python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
cd /workspace/applications/rag/
terraform apply \
-var-file=workloads.tfvars \
-var=jupyter_add_auth=false \
-var=frontend_add_auth=false \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=kubernetes_namespace=rag-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
-var=ray_service_account=ray-sa-$SHORT_SHA \
-var=rag_service_account=rag-sa-$SHORT_SHA \
-var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
-var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/rag_tf_result.txt
# Validate Ray: Make sure pods are running
kubectl wait --all pods -n rag-$SHORT_SHA --for=condition=Ready --timeout=300s
kubectl port-forward -n rag-$SHORT_SHA service/example-cluster-kuberay-head-svc 8265:8265 &
kubectl wait --all pods -n rag-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/example-cluster-kuberay-head-svc 8265:8265 &
# Wait port-forwarding to take its place
sleep 5s
Expand All @@ -225,24 +226,25 @@ steps:
echo "pass" > /workspace/rag_ray_dashboard_result.txt
# Validate Jupyterhub: Get hub url
kubectl get services -n rag-$SHORT_SHA
kubectl get service proxy-public -n rag-$SHORT_SHA --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/rag_jupyterhub_host_url.txt
kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
kubectl get service proxy-public -n rag-$SHORT_SHA-$_BUILD_ID --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/rag_jupyterhub_host_url.txt
echo "HOST URL is " $(cat /workspace/rag_jupyterhub_host_url.txt)
# Validate Jupyterhub: Test Hub
cd /workspace/modules/jupyter/tests
python3 test_hub.py $(cat /workspace/rag_jupyterhub_host_url.txt)
python3 test_hub.py $(cat /workspace/rag_jupyterhub_host_url.txt) $_AUTOPILOT_CLUSTER
echo "pass" > /workspace/rag_jupyterhub_test_result.txt
# Validate RAG: Test rag frontend
kubectl port-forward -n rag-$SHORT_SHA service/rag-frontend 8081:8080 &
kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &
# Wait port-forwarding to take its place
sleep 5s
cd /workspace/applications/rag/tests
python3 test_frontend.py "127.0.0.1:8081"
echo "pass" > /workspace/rag_frontend_result.txt
allowFailure: true
waitFor: ['cleanup jupyterhub', 'cleanup ray cluster']

- id: 'cleanup rag'
name: 'gcr.io/$PROJECT_ID/terraform'
Expand All @@ -252,38 +254,17 @@ steps:
- |
set -e
cd /workspace/applications/jupyter/
terraform destroy \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=kubernetes_namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
-auto-approve -no-color
cd /workspace/applications/ray/
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=ml-$SHORT_SHA \
-var=workload_identity_service_account=ray-sa-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-auto-approve -no-color
cd /workspace/applications/rag/
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=kubernetes_namespace=rag-$SHORT_SHA \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
-var=ray_service_account=ray-sa-$SHORT_SHA \
-var=rag_service_account=rag-sa-$SHORT_SHA \
-var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
-var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color
allowFailure: true
Expand All @@ -299,7 +280,8 @@ steps:
cd /workspace/infrastructure
terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-var=cluster_location=$_REGION -auto-approve -no-color
allowFailure: true
Expand All @@ -315,7 +297,7 @@ steps:
echo "gke cluster creation failed"
exit 1
fi
if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then
echo "ray API run failed"
exit 1
Expand Down Expand Up @@ -361,8 +343,9 @@ steps:
substitutions:
_REGION: us-central1
_USER_NAME: github
_AUTOPILOT_CLUSTER: "false"
_BUILD_ID: ${BUILD_ID:0:8}
options:
substitutionOption: 'ALLOW_LOOSE'
machineType: 'E2_HIGHCPU_8'
## increase timeout to make sure job completes, else will lead to residual resources
timeout: 5400s
timeout: 5400s
11 changes: 9 additions & 2 deletions modules/jupyter/tests/change_jupyter_config.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import yaml
import sys

with open("../jupyter_config/config-selfauth.yaml", "r") as yaml_file:
config_file = "../jupyter_config/config-selfauth.yaml"
if len(sys.argv) == 2:
autopilot = (sys.argv[1] == "true")
if autopilot:
config_file = "../jupyter_config/config-selfauth-autopilot.yaml"

with open(config_file, "r") as yaml_file:
data = yaml.safe_load(yaml_file)

data["hub"]["config"]["DummyAuthenticator"]["password"] = "dummy"

with open("../jupyter_config/config-selfauth.yaml", 'w') as yaml_file:
with open(config_file, 'w') as yaml_file:
yaml.dump(data, yaml_file)
8 changes: 7 additions & 1 deletion modules/jupyter/tests/test_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

from packaging.version import Version as V

config_file = "../jupyter_config/config-selfauth.yaml"
if len(sys.argv) == 3:
autopilot = (sys.argv[2] == "true")
if autopilot:
config_file = "../jupyter_config/config-selfauth-autopilot.yaml"


def test_hub_up(hub_url):
r = requests.get(hub_url)
Expand Down Expand Up @@ -32,7 +38,7 @@ def test_hub_login(hub_url):
from /jupyter_config/config.yaml. After successfully login, user will be
redirected to /hub/spawn.
"""
with open("../jupyter_config/config-selfauth.yaml", "r") as yaml_file:
with open(config_file, "r") as yaml_file:
data = yaml.safe_load(yaml_file)

username = data["hub"]["config"]["Authenticator"]["admin_users"][0]
Expand Down
1 change: 1 addition & 0 deletions modules/kuberay-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {
      namespace_selector {
        # A single match_labels map cannot carry the same key twice (duplicate
        # object key in HCL), and one label can never equal two values anyway.
        # Use match_expressions with the "In" operator to allow traffic from
        # either the workload namespace or gke-gmp-system.
        match_expressions {
          key      = "kubernetes.io/metadata.name"
          operator = "In"
          values   = [var.namespace, "gke-gmp-system"]
        }
      }
}
Expand Down

0 comments on commit f8c0453

Please sign in to comment.