From f8c04530c21a259fb2dfd74ffde00360df2a04e3 Mon Sep 17 00:00:00 2001
From: Himanshu Sachdeva
Date: Mon, 18 Mar 2024 19:18:13 +0100
Subject: [PATCH] CI for AutoPilot clusters (#315)

* added cloudbuild.yaml to test on AP clusters
* debug jupyter test
* parameterize scripts for autopilot
* refactor cloudbuild file to use cluster_type
* fix jupyter test script args
* single cloudbuild for std & ap clusters
* replace suffix with build_id
* fix namespace_selector for gke-gmp-system
---
 cloudbuild.yaml                            | 135 ++++++++----------
 .../jupyter/tests/change_jupyter_config.py |  11 +-
 modules/jupyter/tests/test_hub.py          |   8 +-
 modules/kuberay-cluster/main.tf            |   1 +
 4 files changed, 76 insertions(+), 79 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 790d71d5b..918388b7a 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -55,17 +55,19 @@ steps:
   - '-c'
   - |
     set -e
+
     terraform apply \
    -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
    -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+   -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
    -var=cluster_location=$_REGION \
    -auto-approve -no-color -lock=false
    echo "pass" > /workspace/gke_cluster_result.txt
  dir: 'infrastructure/'
  allowFailure: true
  waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', 'validate rag']
-
+
- id: 'test ray cluster'
  name: 'gcr.io/$PROJECT_ID/terraform'
  entrypoint: 'sh'
@@ -76,7 +78,7 @@ steps:
    set -e

    # Get kube config
    gcloud container clusters get-credentials \
-   ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+   ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
    --location $_REGION \
    --project $PROJECT_ID
@@ -84,17 +86,17 @@ steps:
    terraform apply \
    -var-file=workloads.tfvars \
    -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
    -var=cluster_location=$_REGION \
-   -var=kubernetes_namespace=ml-$SHORT_SHA \
-   -var=workload_identity_service_account=ray-sa-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
+   -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+   -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
    -auto-approve -no-color -lock=false
    echo "pass" > /workspace/user_result.txt

    # Make sure pods are running
-   kubectl wait --all pods -n ml-$SHORT_SHA --for=condition=Ready --timeout=300s
-   kubectl port-forward -n ml-$SHORT_SHA service/example-cluster-kuberay-head-svc 8265:8265 &
+   kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
+   kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID service/example-cluster-kuberay-head-svc 8265:8265 &

    # Wait for port-forwarding to take effect
    sleep 5s
@@ -116,12 +118,12 @@ steps:
    terraform destroy \
    -var-file=workloads.tfvars \
    -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
    -var=cluster_location=$_REGION \
-   -var=kubernetes_namespace=ml-$SHORT_SHA \
-   -var=workload_identity_service_account=ray-sa-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-   -auto-approve -no-color
+   -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+   -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
+   -auto-approve -no-color -lock=false
  allowFailure: true
  waitFor: ['test ray cluster']

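One cloudbuild.yaml now serves both cluster types: the Standard run keeps the `_AUTOPILOT_CLUSTER: "false"` default (see the substitutions hunk near the end), and an Autopilot run is just a second trigger overriding that one value. `${BUILD_ID:0:8}` uses Cloud Build's bash-style string operations to derive an eight-character suffix that separates concurrent builds without bloating the derived bucket, service-account, and namespace names. A minimal sketch of such a companion trigger, with an assumed trigger name and repository coordinates, importable via `gcloud builds triggers import`:

    # trigger-autopilot.yaml (hypothetical)
    name: ml-ci-autopilot          # assumed trigger name
    filename: cloudbuild.yaml
    github:                        # assumed repository coordinates
      owner: GoogleCloudPlatform
      name: ai-on-gke
      pullRequest:
        branch: .*
    substitutions:
      _AUTOPILOT_CLUSTER: "true"   # flips the same pipeline to an Autopilot cluster
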
@@ -135,29 +137,28 @@ steps:
    set -e

    cd /workspace/modules/jupyter/tests
-   python3 change_jupyter_config.py
+   python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER

    cd /workspace/applications/jupyter
    terraform apply \
    -var-file=workloads-without-iap.example.tfvars \
    -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-   -var=kubernetes_namespace=ml-$SHORT_SHA \
-   -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+   -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+   -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
    -auto-approve -no-color -lock=false
    echo "pass" > /workspace/jupyterhub_tf_result.txt

-   kubectl wait --all pods -n ml-$SHORT_SHA --for=condition=Ready --timeout=300s
-   kubectl get services -n ml-$SHORT_SHA
-   kubectl get service proxy-public -n ml-$SHORT_SHA --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/jupyterhub_host_url.txt
+   kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
+   kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID
+   kubectl get service proxy-public -n ml-$SHORT_SHA-$_BUILD_ID --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/jupyterhub_host_url.txt
    echo "HOST URL is " $(cat /workspace/jupyterhub_host_url.txt)

    cd /workspace/modules/jupyter/tests
-   python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt)
+   python3 test_hub.py $(cat /workspace/jupyterhub_host_url.txt) $_AUTOPILOT_CLUSTER
    echo "pass" > /workspace/jupyterhub_test_result.txt
  allowFailure: true
-  # waitFor: ['cleanup ray cluster']

- id: 'cleanup jupyterhub'
  name: 'gcr.io/$PROJECT_ID/terraform'
@@ -171,11 +172,11 @@ steps:
    terraform destroy \
    -var-file=workloads-without-iap.example.tfvars \
    -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-   -var=kubernetes_namespace=ml-$SHORT_SHA \
-   -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
-   -auto-approve -no-color
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+   -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID \
+   -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
+   -auto-approve -no-color -lock=false
  allowFailure: true
  waitFor: ['test jupyterhub']

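Both test helpers now take the cluster type on argv (change_jupyter_config.py as its only argument, test_hub.py after the hub URL) and switch between config-selfauth.yaml and config-selfauth-autopilot.yaml, but they read and write the same keys. A sketch of the layout the two scripts assume, with placeholder values; the real files live under modules/jupyter/jupyter_config/:

    # Shape assumed by change_jupyter_config.py and test_hub.py (values illustrative)
    hub:
      config:
        Authenticator:
          admin_users:
            - admin            # test_hub.py logs in as admin_users[0]
        DummyAuthenticator:
          password: dummy      # change_jupyter_config.py overwrites this value

If either config file drops one of these keys, the scripts die with a KeyError rather than a clean assertion, so the standard and autopilot configs have to stay structurally in sync.
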
@@ -190,12 +191,12 @@ steps:

    # Get kube config
    gcloud container clusters get-credentials \
-   ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+   ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
    --location $_REGION \
    --project $PROJECT_ID

    cd /workspace/modules/jupyter/tests
-   python3 change_jupyter_config.py
+   python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER

    cd /workspace/applications/rag/
    terraform apply \
@@ -203,19 +204,19 @@ steps:
    -var=jupyter_add_auth=false \
    -var=frontend_add_auth=false \
    -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-   -var=kubernetes_namespace=rag-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
-   -var=ray_service_account=ray-sa-$SHORT_SHA \
-   -var=rag_service_account=rag-sa-$SHORT_SHA \
-   -var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
-   -var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+   -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
+   -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
+   -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
    -auto-approve -no-color -lock=false
    echo "pass" > /workspace/rag_tf_result.txt

    # Validate Ray: Make sure pods are running
-   kubectl wait --all pods -n rag-$SHORT_SHA --for=condition=Ready --timeout=300s
-   kubectl port-forward -n rag-$SHORT_SHA service/example-cluster-kuberay-head-svc 8265:8265 &
+   kubectl wait --all pods -n rag-$SHORT_SHA-$_BUILD_ID --for=condition=Ready --timeout=300s
+   kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/example-cluster-kuberay-head-svc 8265:8265 &

    # Wait for port-forwarding to take effect
    sleep 5s
@@ -225,17 +226,17 @@ steps:
    echo "pass" > /workspace/rag_ray_dashboard_result.txt

    # Validate Jupyterhub: Get hub url
-   kubectl get services -n rag-$SHORT_SHA
-   kubectl get service proxy-public -n rag-$SHORT_SHA --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/rag_jupyterhub_host_url.txt
+   kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
+   kubectl get service proxy-public -n rag-$SHORT_SHA-$_BUILD_ID --output jsonpath='{.status.loadBalancer.ingress[0].ip}' > /workspace/rag_jupyterhub_host_url.txt
    echo "HOST URL is " $(cat /workspace/rag_jupyterhub_host_url.txt)

    # Validate Jupyterhub: Test Hub
    cd /workspace/modules/jupyter/tests
-   python3 test_hub.py $(cat /workspace/rag_jupyterhub_host_url.txt)
+   python3 test_hub.py $(cat /workspace/rag_jupyterhub_host_url.txt) $_AUTOPILOT_CLUSTER
    echo "pass" > /workspace/rag_jupyterhub_test_result.txt

    # Validate RAG: Test rag frontend
-   kubectl port-forward -n rag-$SHORT_SHA service/rag-frontend 8081:8080 &
+   kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &

    # Wait for port-forwarding to take effect
    sleep 5s
@@ -243,6 +244,7 @@ steps:
    python3 test_frontend.py "127.0.0.1:8081"
    echo "pass" > /workspace/rag_frontend_result.txt
  allowFailure: true
+  waitFor: ['cleanup jupyterhub', 'cleanup ray cluster']

- id: 'cleanup rag'
  name: 'gcr.io/$PROJECT_ID/terraform'
@@ -252,38 +254,17 @@ steps:
  - '-c'
  - |
    set -e

-   cd /workspace/applications/jupyter/
-   terraform destroy \
-   -var-file=workloads-without-iap.example.tfvars \
-   -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-   -var=kubernetes_namespace=ml-$SHORT_SHA \
-   -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA \
-   -auto-approve -no-color
-
-   cd /workspace/applications/ray/
-   terraform destroy \
-   -var-file=workloads.tfvars \
-   -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-   -var=cluster_location=$_REGION \
-   -var=kubernetes_namespace=ml-$SHORT_SHA \
-   -var=workload_identity_service_account=ray-sa-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA \
-   -auto-approve -no-color
-
    cd /workspace/applications/rag/
    terraform destroy \
    -var-file=workloads.tfvars \
    -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
-   -var=kubernetes_namespace=rag-$SHORT_SHA \
-   -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA \
-   -var=ray_service_account=ray-sa-$SHORT_SHA \
-   -var=rag_service_account=rag-sa-$SHORT_SHA \
-   -var=jupyter_service_account=jupyter-sa-$SHORT_SHA \
-   -var=cloudsql_instance=pgvector-instance-$SHORT_SHA \
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+   -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
+   -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
+   -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
+   -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
    -auto-approve -no-color
  allowFailure: true
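Every test and cleanup step in this file carries allowFailure: true, so a red step cannot short-circuit the teardown chain; instead each step drops a "pass" marker into /workspace, and a final result-checking step (its body appears in the @@ -315 hunk below) turns any missing marker into a build failure after cleanup has had its chance to run. A trimmed sketch of that pattern, with an illustrative step id:

    # Marker-file gate pattern used by this pipeline (step id illustrative);
    # user_result.txt is written by the 'test ray cluster' step above.
    - id: 'check ray result'
      name: 'ubuntu'
      entrypoint: 'bash'
      args:
        - '-c'
        - |
          if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then
            echo "ray cluster workload failed"
            exit 1
          fi
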
@@ -299,7 +280,8 @@ steps:

    cd /workspace/infrastructure
    terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-cluster \
+   -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
+   -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
    -var=cluster_location=$_REGION -auto-approve -no-color
  allowFailure: true

@@ -315,7 +297,7 @@ steps:
    echo "gke cluster creation failed"
    exit 1
    fi
-
+
    if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then
    echo "ray API run failed"
    exit 1
@@ -361,8 +343,9 @@ steps:
 substitutions:
   _REGION: us-central1
   _USER_NAME: github
+  _AUTOPILOT_CLUSTER: "false"
+  _BUILD_ID: ${BUILD_ID:0:8}
 options:
   substitutionOption: 'ALLOW_LOOSE'
   machineType: 'E2_HIGHCPU_8'
-## increase timeout to make sure job completes, else will lead to residual resources
-timeout: 5400s
+timeout: 5400s
\ No newline at end of file
diff --git a/modules/jupyter/tests/change_jupyter_config.py b/modules/jupyter/tests/change_jupyter_config.py
index cfe50ee0e..f404750b7 100644
--- a/modules/jupyter/tests/change_jupyter_config.py
+++ b/modules/jupyter/tests/change_jupyter_config.py
@@ -1,9 +1,16 @@
 import yaml
+import sys

-with open("../jupyter_config/config-selfauth.yaml", "r") as yaml_file:
+config_file = "../jupyter_config/config-selfauth.yaml"
+if len(sys.argv) == 2:
+    autopilot = (sys.argv[1] == "true")
+    if autopilot:
+        config_file = "../jupyter_config/config-selfauth-autopilot.yaml"
+
+with open(config_file, "r") as yaml_file:
     data = yaml.safe_load(yaml_file)

 data["hub"]["config"]["DummyAuthenticator"]["password"] = "dummy"

-with open("../jupyter_config/config-selfauth.yaml", 'w') as yaml_file:
+with open(config_file, 'w') as yaml_file:
     yaml.dump(data, yaml_file)
diff --git a/modules/jupyter/tests/test_hub.py b/modules/jupyter/tests/test_hub.py
index d324aaa1f..4c06daba6 100644
--- a/modules/jupyter/tests/test_hub.py
+++ b/modules/jupyter/tests/test_hub.py
@@ -4,6 +4,12 @@
 from packaging.version import Version as V


+config_file = "../jupyter_config/config-selfauth.yaml"
+if len(sys.argv) == 3:
+    autopilot = (sys.argv[2] == "true")
+    if autopilot:
+        config_file = "../jupyter_config/config-selfauth-autopilot.yaml"
+
 def test_hub_up(hub_url):
     r = requests.get(hub_url)
@@ -32,7 +38,7 @@ def test_hub_login(hub_url):
     from /jupyter_config/config.yaml. After successful login, the user is
     redirected to /hub/spawn.
     """
-    with open("../jupyter_config/config-selfauth.yaml", "r") as yaml_file:
+    with open(config_file, "r") as yaml_file:
         data = yaml.safe_load(yaml_file)

     username = data["hub"]["config"]["Authenticator"]["admin_users"][0]
diff --git a/modules/kuberay-cluster/main.tf b/modules/kuberay-cluster/main.tf
index 33ec9a063..f77cc7203 100644
--- a/modules/kuberay-cluster/main.tf
+++ b/modules/kuberay-cluster/main.tf
@@ -177,6 +177,7 @@ resource "kubernetes_network_policy" "kuberay-cluster-allow-network-policy" {
       namespace_selector {
         match_labels = {
           "kubernetes.io/metadata.name" = var.namespace
+          "kubernetes.io/metadata.name" = "gke-gmp-system"
         }
       }
     }
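One caveat on the final hunk: match_labels is a map, so repeating the kubernetes.io/metadata.name key either fails Terraform validation or collapses to a single entry, and even as a plain label selector the entries are ANDed, so one key can never equal both the workload namespace and gke-gmp-system (the namespace where GKE's managed Prometheus collectors run). A sketch of the presumably intended "either namespace" semantics, shown as the rendered Kubernetes NetworkPolicy ingress rule with an illustrative namespace standing in for var.namespace:

    # Sketch of the rendered policy; in the Terraform provider this maps to a
    # match_expressions block inside namespace_selector instead of match_labels.
    ingress:
      - from:
          - namespaceSelector:
              matchExpressions:
                - key: kubernetes.io/metadata.name
                  operator: In
                  values:
                    - ml-example        # illustrative; stands in for var.namespace
                    - gke-gmp-system    # GKE managed Prometheus system namespace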