CI for AutoPilot clusters #315

Merged · 13 commits · Mar 18, 2024
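This change wires Autopilot support into the CI pipeline: a new _AUTOPILOT_CLUSTER substitution is passed to the infrastructure Terraform and to the Jupyter test scripts, and every per-build resource name (cluster, namespaces, service accounts, GCS buckets, the CloudSQL instance) now carries a $_BUILD_ID suffix so concurrent builds cannot collide. The cleanup steps also gain -lock=false, and the redundant jupyter/ray destroy calls are dropped from the 'cleanup rag' step.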
135 changes: 59 additions & 76 deletions cloudbuild.yaml
@@ -55,17 +55,19 @@ steps:
- '-c'
- |
set -e

terraform apply \
-var-file=tfvars_tests/standard-gke-public.platform.tfvars \
-var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+ -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-var=cluster_location=$_REGION \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/gke_cluster_result.txt
dir: 'infrastructure/'
allowFailure: true
waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', 'validate rag']

- id: 'test ray cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'sh'
@@ -76,25 +78,25 @@ steps:

# Get kube config
gcloud container clusters get-credentials \
- ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+ ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
--location $_REGION \
--project $PROJECT_ID

cd /workspace/applications/ray/
terraform apply \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
- -var=kubernetes_namespace=ml-$SHORT_SHA \
- -var=workload_identity_service_account=ray-sa-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
+ -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+ -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/user_result.txt

# Make sure pods are running
- kubectl wait --all pods -n ml-$SHORT_SHA --for=condition=Ready --timeout=300s
- kubectl port-forward -n ml-$SHORT_SHA service/example-cluster-kuberay-head-svc 8265:8265 &
+ kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
+ kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/example-cluster-kuberay-head-svc 8265:8265 &
# Wait for port-forwarding to take effect
sleep 5s

@@ -116,12 +118,12 @@ steps:
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
- -var=kubernetes_namespace=ml-$SHORT_SHA \
- -var=workload_identity_service_account=ray-sa-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
- -auto-approve -no-color
+ -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+ -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
+ -auto-approve -no-color -lock=false

allowFailure: true
waitFor: ['test ray cluster']
@@ -135,29 +137,28 @@ steps:
set -e

cd /workspace/modules/jupyter/tests
- python3 change_jupyter_config.py
+ python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER

cd /workspace/applications/jupyter
terraform apply \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
- -var=kubernetes_namespace=ml-$SHORT_SHA \
- -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+ -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+ -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/jupyterhub_tf_result.txt

- kubectl wait --all pods -n ml-$SHORT_SHA --for=condition=Ready --timeout=300s
- kubectl get services -n ml-$SHORT_SHA
- kubectl get service proxy-public -n ml-$SHORT_SHA --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/jupyterhub_host_url.txt
+ kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
+ kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID
+ kubectl get service proxy-public -n ml-$SHORT_SHA-$_BUILD_ID --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/jupyterhub_host_url.txt
echo "HOST URL is " $(cat /workspace/jupyterhub_host_url.txt)

cd /workspace/modules/jupyter/tests
- python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt)
+ python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt) $_AUTOPILOT_CLUSTER
echo "pass" > /workspace/jupyterhub_test_result.txt
allowFailure: true
- # waitFor: ['cleanup ray cluster']

- id: 'cleanup jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
@@ -171,11 +172,11 @@ steps:
terraform destroy \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
- -var=kubernetes_namespace=ml-$SHORT_SHA \
- -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
- -auto-approve -no-color
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+ -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+ -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
+ -auto-approve -no-color -lock=false

allowFailure: true
waitFor: ['test jupyterhub']
@@ -190,32 +191,32 @@ steps:

# Get kube config
gcloud container clusters get-credentials \
- ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+ ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
--location $_REGION \
--project $PROJECT_ID

cd /workspace/modules/jupyter/tests
- python3 change_jupyter_config.py
+ python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER

cd /workspace/applications/rag/
terraform apply \
-var-file=workloads.tfvars \
-var=jupyter_add_auth=false \
-var=frontend_add_auth=false \
-var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
- -var=kubernetes_namespace=rag-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
- -var=ray_service_account=ray-sa-$SHORT_SHA \
- -var=rag_service_account=rag-sa-$SHORT_SHA \
- -var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
- -var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+ -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
+ -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
+ -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color -lock=false
echo "pass" > /workspace/rag_tf_result.txt

# Validate Ray: Make sure pods are running
- kubectl wait --all pods -n rag-$SHORT_SHA --for=condition=Ready --timeout=300s
- kubectl port-forward -n rag-$SHORT_SHA service/example-cluster-kuberay-head-svc 8265:8265 &
+ kubectl wait --all pods -n rag-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
+ kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/example-cluster-kuberay-head-svc 8265:8265 &
# Wait for port-forwarding to take effect
sleep 5s

@@ -225,24 +226,25 @@
echo "pass" > /workspace/rag_ray_dashboard_result.txt

# Validate Jupyterhub: Get hub url
- kubectl get services -n rag-$SHORT_SHA
- kubectl get service proxy-public -n rag-$SHORT_SHA --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/rag_jupyterhub_host_url.txt
+ kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
+ kubectl get service proxy-public -n rag-$SHORT_SHA-$_BUILD_ID --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/rag_jupyterhub_host_url.txt
echo "HOST URL is " $(cat /workspace/rag_jupyterhub_host_url.txt)

# Validate Jupyterhub: Test Hub
cd /workspace/modules/jupyter/tests
- python3 test_hub.py $(cat /workspace/rag_jupyterhub_host_url.txt)
+ python3 test_hub.py $(cat /workspace/rag_jupyterhub_host_url.txt) $_AUTOPILOT_CLUSTER
echo "pass" > /workspace/rag_jupyterhub_test_result.txt

# Validate RAG: Test rag frontend
- kubectl port-forward -n rag-$SHORT_SHA service/rag-frontend 8081:8080 &
+ kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &
# Wait for port-forwarding to take effect
sleep 5s

cd /workspace/applications/rag/tests
python3 test_frontend.py "127.0.0.1:8081"
echo "pass" > /workspace/rag_frontend_result.txt
allowFailure: true
+ waitFor: ['cleanup jupyterhub', 'cleanup ray cluster']

- id: 'cleanup rag'
name: 'gcr.io/$PROJECT_ID/terraform'
@@ -252,38 +254,17 @@
- |
set -e

- cd /workspace/applications/jupyter/
- terraform destroy \
- -var-file=workloads-without-iap.example.tfvars \
- -var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
- -var=kubernetes_namespace=ml-$SHORT_SHA \
- -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
- -auto-approve -no-color
-
- cd /workspace/applications/ray/
- terraform destroy \
- -var-file=workloads.tfvars \
- -var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
- -var=cluster_location=$_REGION \
- -var=kubernetes_namespace=ml-$SHORT_SHA \
- -var=workload_identity_service_account=ray-sa-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
- -auto-approve -no-color

cd /workspace/applications/rag/
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
- -var=kubernetes_namespace=rag-$SHORT_SHA \
- -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
- -var=ray_service_account=ray-sa-$SHORT_SHA \
- -var=rag_service_account=rag-sa-$SHORT_SHA \
- -var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
- -var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+ -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
+ -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
+ -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+ -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color

allowFailure: true
@@ -299,7 +280,8 @@ steps:

cd /workspace/infrastructure
terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
- -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+ -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+ -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-var=cluster_location=$_REGION -auto-approve -no-color

allowFailure: true
@@ -315,7 +297,7 @@ steps:
echo "gke cluster creation failed"
exit 1
fi

if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then
echo "ray API run failed"
exit 1
@@ -361,8 +343,9 @@
substitutions:
_REGION: us-central1
_USER_NAME: github
+ _AUTOPILOT_CLUSTER: "false"
+ _BUILD_ID: ${BUILD_ID:0:8}
options:
substitutionOption: 'ALLOW_LOOSE'
machineType: 'E2_HIGHCPU_8'
## increase timeout to make sure job completes, else will lead to residual resources
- timeout: 5400s
\ No newline at end of file
+ timeout: 5400s
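A note on the new _BUILD_ID substitution: Cloud Build supports bash-style string operations in substitution values, so ${BUILD_ID:0:8} trims the full build UUID down to a short suffix that keeps concurrently running builds from claiming the same cluster, namespace, bucket, and CloudSQL names. The same expansion in plain bash, with a hypothetical build ID:

    BUILD_ID="1a2b3c4d-5e6f-7a8b-9c0d-e1f2a3b4c5d6"   # hypothetical value
    echo "${BUILD_ID:0:8}"                            # prints: 1a2b3c4d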
11 changes: 9 additions & 2 deletions modules/jupyter/tests/change_jupyter_config.py
@@ -1,9 +1,16 @@
import yaml
+ import sys

- with open("../jupyter_config/config-selfauth.yaml", "r") as yaml_file:
+ config_file = "../jupyter_config/config-selfauth.yaml"
+ if len(sys.argv) == 2:
+     autopilot = (sys.argv[1] == "true")
+     if autopilot:
+         config_file = "../jupyter_config/config-selfauth-autopilot.yaml"
+
+ with open(config_file, "r") as yaml_file:
data = yaml.safe_load(yaml_file)

data["hub"]["config"]["DummyAuthenticator"]["password"] = "dummy"

with open("../jupyter_config/config-selfauth.yaml", 'w') as yaml_file:
with open(config_file, 'w') as yaml_file:
yaml.dump(data, yaml_file)
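For reference, this is how the Cloud Build steps above now invoke the script; the single argument is the $_AUTOPILOT_CLUSTER substitution, and any value other than the string "true" leaves the standard config file in place:

    cd /workspace/modules/jupyter/tests
    python3 change_jupyter_config.py true    # patches config-selfauth-autopilot.yaml
    python3 change_jupyter_config.py false   # patches config-selfauth.yaml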
8 changes: 7 additions & 1 deletion modules/jupyter/tests/test_hub.py
@@ -4,6 +4,12 @@

from packaging.version import Version as V

config_file = "../jupyter_config/config-selfauth.yaml"
hsachdevah marked this conversation as resolved.
Show resolved Hide resolved
if len(sys.argv) == 3:
autopilot = (sys.argv[2] == "true")
if autopilot:
config_file = "../jupyter_config/config-selfauth-autopilot.yaml"


def test_hub_up(hub_url):
r = requests.get(hub_url)
@@ -32,7 +38,7 @@ def test_hub_login(hub_url):
from /jupyter_config/config.yaml. After a successful login, the user is
redirected to /hub/spawn.
"""
with open("../jupyter_config/config-selfauth.yaml", "r") as yaml_file:
with open(config_file, "r") as yaml_file:
data = yaml.safe_load(yaml_file)

username = data["hub"]["config"]["Authenticator"]["admin_users"][0]
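And the matching test_hub.py invocation from the build steps; the first argument is the hub IP captured from the proxy-public service, the second is again $_AUTOPILOT_CLUSTER, which selects the config file the test reads its credentials from:

    cd /workspace/modules/jupyter/tests
    python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt) false   # Standard cluster run
    python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt) true    # Autopilot run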
1 change: 1 addition & 0 deletions modules/kuberay-cluster/main.tf
@@ -177,6 +177,7 @@ resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {
namespace_selector {
match_labels = {
"kubernetes.io/metadata.name" = var.namespace
"kubernetes.io/metadata.name" = "gke-gmp-system"
}
}
}
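One caveat on the hunk above: a label selector's match_labels map can hold only one value per key, and its entries are ANDed, so repeating kubernetes.io/metadata.name cannot select both namespaces at once. A hedged sketch of how the same intent (admit traffic from the workload namespace and from gke-gmp-system, where the managed Prometheus collectors run) is usually written with match_expressions instead; this is an illustration, not necessarily the shape the module finally adopted:

    namespace_selector {
      match_expressions {
        key      = "kubernetes.io/metadata.name"
        operator = "In"
        values   = [var.namespace, "gke-gmp-system"]
      }
    }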