Skip to content

Commit

Permalink
delete kuberay-logging and kuberay-operator module
Browse files Browse the repository at this point in the history
Change-Id: If63c090e99186df2fb5ed27c37d187cd75cbf663
  • Loading branch information
Gen Lu committed Aug 29, 2024
1 parent d15b816 commit ad373c1
Show file tree
Hide file tree
Showing 13 changed files with 20 additions and 757 deletions.
34 changes: 17 additions & 17 deletions applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,25 +64,25 @@ module "infra" {
source = "../../infrastructure"
count = var.create_cluster ? 1 : 0

project_id = var.project_id
cluster_name = local.cluster_name
cluster_location = var.cluster_location
region = local.cluster_location_region
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = var.create_network
network_name = local.network_name
subnetwork_name = local.network_name
subnetwork_cidr = var.subnetwork_cidr
subnetwork_region = local.cluster_location_region
cpu_pools = var.cpu_pools
enable_gpu = true
gpu_pools = var.gpu_pools
ray_addon_enabled = true
project_id = var.project_id
cluster_name = local.cluster_name
cluster_location = var.cluster_location
region = local.cluster_location_region
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = var.create_network
network_name = local.network_name
subnetwork_name = local.network_name
subnetwork_cidr = var.subnetwork_cidr
subnetwork_region = local.cluster_location_region
cpu_pools = var.cpu_pools
enable_gpu = true
gpu_pools = var.gpu_pools
ray_addon_enabled = true
# TODO(genlu): remove the release_channel and kubernetes_version pins below once the Ray add-on is available in the REGULAR channel
release_channel = "RAPID"
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"
depends_on = [module.project-services]
depends_on = [module.project-services]
}

data "google_container_cluster" "default" {
Expand Down
273 changes: 3 additions & 270 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,6 @@ steps:
dir: 'applications/ray/'
waitFor: ['validate platform']

- id: 'validate jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
script: |
terraform init -no-color
terraform validate -no-color
dir: 'applications/jupyter/'
waitFor: ['validate platform']

- id: 'validate rag'
name: 'gcr.io/$PROJECT_ID/terraform'
script: |
terraform init -no-color
terraform validate -no-color
dir: 'applications/rag/'
waitFor: ['validate platform']

# Create cluster to test ray (the jupyterhub and rag test steps were removed from this pipeline)
- id: 'create gke cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
Expand All @@ -71,7 +55,7 @@ steps:
echo "pass" > /workspace/gke_cluster_result.txt
dir: 'infrastructure/'
allowFailure: true
waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag]
waitFor: ['validate platform', 'validate ray']

- id: 'test ray cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
Expand Down Expand Up @@ -114,216 +98,6 @@ steps:
allowFailure: true
waitFor: ['create gke cluster']

- id: 'cleanup ray cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/applications/ray/
terraform destroy \
-var-file=workloads.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \
-var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \
-var=enable_gpu=true \
-auto-approve -no-color
allowFailure: true
waitFor: ['test ray cluster']

- id: 'test jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/modules/jupyter/tests
python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
cd /workspace/applications/jupyter
terraform apply \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color
echo "pass" > /workspace/jupyterhub_tf_result.txt
kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s
kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter
kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 &
# Wait port-forwarding to take its place
sleep 5s
cd /workspace/modules/jupyter/tests
python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER
echo "pass" > /workspace/jupyterhub_test_result.txt
allowFailure: true
waitFor: ['create gke cluster']

- id: 'cleanup jupyterhub'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/applications/jupyter/
terraform destroy \
-var-file=workloads-without-iap.example.tfvars \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \
-var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color
allowFailure: true
waitFor: ['test jupyterhub']

- id: 'test rag'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'sh'
secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY']
args:
- '-c'
- |
set -e
# Get kube config
gcloud container clusters get-credentials \
ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
--location $_REGION \
--project $PROJECT_ID
cd /workspace/modules/jupyter/tests
python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER
cd /workspace/applications/rag/
terraform apply \
-var-file=workloads.tfvars \
-var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-var=create_cluster=false \
-var=jupyter_add_auth=false \
-var=frontend_add_auth=false \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color
echo "pass" > /workspace/rag_tf_result.txt
# Validate Ray: Make sure pods are running
kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s
kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 &
# Wait port-forwarding to take its place
sleep 5s
# Validate Ray: Check dashboard
ray job submit --working-dir ./tests \
--address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())"
echo "pass" > /workspace/rag_ray_dashboard_result.txt
# Validate JupyterHub: Get hub url
kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID
kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 &
# Wait port-forwarding to take its place
sleep 5s
# Validate JupyterHub: Test Hub
cd /workspace/modules/jupyter/tests
python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER
echo "pass" > /workspace/rag_jupyterhub_test_result.txt
# Validate RAG: Test rag frontend
kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 &
# Wait port-forwarding to take its place
sleep 5s
cd /workspace/applications/rag/tests
python3 test_frontend.py "127.0.0.1:8081"
echo "pass" > /workspace/rag_frontend_result.txt
cd /workspace/
sed -i "s/<username>/$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
sed -i "s/<token>/$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb
gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/
kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py
# Wait for jupyterhub to trigger notebook pod startup
sleep 5s
kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s
kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb
kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py
python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt"
echo "pass" > /workspace/rag_prompt_result.txt
allowFailure: true
waitFor: ['create gke cluster']

- id: 'cleanup rag'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/applications/rag/
terraform destroy \
-var-file=workloads.tfvars \
-var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-var=create_cluster=false \
-var=jupyter_add_auth=false \
-var=frontend_add_auth=false \
-var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=cluster_location=$_REGION \
-var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \
-var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \
-var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \
-var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \
-var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \
-var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \
-auto-approve -no-color
allowFailure: true
waitFor: ['test rag']

- id: 'cleanup gke cluster'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
args:
- '-c'
- |
set -e
cd /workspace/infrastructure
terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
-var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
-var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
-var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
-var=cluster_location=$_REGION -auto-approve -no-color
allowFailure: true
waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster']

- id: 'check result'
name: 'gcr.io/$PROJECT_ID/terraform'
entrypoint: 'bash'
Expand All @@ -345,41 +119,7 @@ steps:
exit 1
fi
if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then
echo "jupyterhub tf failed"
exit 1
fi
if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then
echo "jupyterhub test failed"
exit 1
fi
if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then
echo "rag tf failed"
exit 1
fi
if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then
echo "rag ray dashboard test failed"
exit 1
fi
if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then
echo "rag jupyterhub test failed"
exit 1
fi
if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then
echo "rag frontend test failed"
exit 1
fi
if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then
echo "rag prompt test failed"
exit 1
fi
waitFor: ['cleanup gke cluster']
waitFor: ['test ray cluster']

substitutions:
_REGION: us-east4
Expand All @@ -389,11 +129,4 @@ substitutions:
options:
substitutionOption: 'ALLOW_LOOSE'
machineType: 'E2_HIGHCPU_8'
timeout: 5400s

availableSecrets:
secretManager:
- versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest
env: 'KAGGLE_USERNAME'
- versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest
env: 'KAGGLE_KEY'
timeout: 5400s
18 changes: 0 additions & 18 deletions modules/kuberay-logging/config/fluent-bit.conf

This file was deleted.

4 changes: 0 additions & 4 deletions modules/kuberay-logging/config/parsers.conf

This file was deleted.

Empty file.
Loading

0 comments on commit ad373c1

Please sign in to comment.