diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb index 2b80e437e..0a12d9740 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb @@ -37,9 +37,9 @@ "id": "c7ff518d-f4d2-481b-b408-2c2507565611", "metadata": {}, "source": [ - "## Creating the Database Connection\n", + "## Download the Data\n", "\n", - "Let's now set up a connection to your CloudSQL database:" + "Let's now import required modules:" ] }, { @@ -60,42 +60,7 @@ "from datasets import load_dataset_builder, load_dataset, Dataset\n", "from huggingface_hub import snapshot_download\n", "from google.cloud.sql.connector import Connector, IPTypes\n", - "import sqlalchemy\n", - "\n", - "# initialize parameters\n", - "\n", - "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", - "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", - "DB_NAME = \"pgvector-database\"\n", - "\n", - "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", - "DB_USER = db_username_file.read()\n", - "db_username_file.close()\n", - "\n", - "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", - "DB_PASS = db_password_file.read()\n", - "db_password_file.close()\n", - "\n", - "# initialize Connector object\n", - "connector = Connector()\n", - "\n", - "# function to return the database connection object\n", - "def getconn():\n", - " conn = connector.connect(\n", - " INSTANCE_CONNECTION_NAME,\n", - " \"pg8000\",\n", - " user=DB_USER,\n", - " password=DB_PASS,\n", - " db=DB_NAME,\n", - " ip_type=IPTypes.PRIVATE\n", - " )\n", - " return conn\n", - "\n", - "# create connection pool with 'creator' argument to our connection object function\n", - "pool = sqlalchemy.create_engine(\n", - " \"postgresql+pg8000://\",\n", - " creator=getconn,\n", - ")" + "import sqlalchemy" ] }, { @@ -322,6 +287,40 @@ "from sqlalchemy.orm import scoped_session, sessionmaker, mapped_column\n", "from pgvector.sqlalchemy import Vector\n", "\n", + "# initialize parameters\n", + "\n", + "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", + "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", + "DB_NAME = \"pgvector-database\"\n", + "\n", + "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", + "DB_USER = db_username_file.read()\n", + "db_username_file.close()\n", + "\n", + "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", + "DB_PASS = db_password_file.read()\n", + "db_password_file.close()\n", + "\n", + "# initialize Connector object\n", + "connector = Connector()\n", + "\n", + "# function to return the database connection object\n", + "def getconn():\n", + " conn = connector.connect(\n", + " INSTANCE_CONNECTION_NAME,\n", + " \"pg8000\",\n", + " user=DB_USER,\n", + " password=DB_PASS,\n", + " db=DB_NAME,\n", + " ip_type=IPTypes.PRIVATE\n", + " )\n", + " return conn\n", + "\n", + "# create connection pool with 'creator' argument to our connection object function\n", + "pool = sqlalchemy.create_engine(\n", + " \"postgresql+pg8000://\",\n", + " creator=getconn,\n", + ")\n", "\n", "Base = declarative_base()\n", "DBSession = scoped_session(sessionmaker())\n", diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 575de4f71..a75c46600 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ 
-154,20 +154,9 @@ module "namespace" { module "gcs" { source = "../../modules/gcs" - count = var.create_gcs_bucket ? 1 : 0 + count = 0 project_id = var.project_id - bucket_name = var.gcs_bucket -} - -module "cloudsql" { - source = "../../modules/cloudsql" - providers = { kubernetes = kubernetes.rag } - project_id = var.project_id - instance_name = local.cloudsql_instance - namespace = local.kubernetes_namespace - region = local.cloudsql_instance_region - network_name = local.network_name - depends_on = [module.namespace] + bucket_name = "gke-aieco-rag-a236619-2ec62154" } module "jupyterhub" { @@ -175,7 +164,7 @@ module "jupyterhub" { providers = { helm = helm.rag, kubernetes = kubernetes.rag } namespace = local.kubernetes_namespace project_id = var.project_id - gcs_bucket = var.gcs_bucket + gcs_bucket = "gke-aieco-rag-a236619-2ec62154" add_auth = var.jupyter_add_auth additional_labels = var.additional_labels @@ -205,117 +194,3 @@ module "jupyterhub" { depends_on = [module.namespace, module.gcs] } - -module "kuberay-workload-identity" { - providers = { kubernetes = kubernetes.rag } - source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" - version = "30.0.0" # Pinning to a previous version as current version (30.1.0) showed inconsitent behaviour with workload identity service accounts - use_existing_gcp_sa = !var.create_ray_service_account - name = local.ray_service_account - namespace = local.kubernetes_namespace - project_id = var.project_id - roles = ["roles/cloudsql.client", "roles/monitoring.viewer"] - automount_service_account_token = true - depends_on = [module.namespace] -} - -module "kuberay-monitoring" { - source = "../../modules/kuberay-monitoring" - providers = { helm = helm.rag, kubernetes = kubernetes.rag } - project_id = var.project_id - autopilot_cluster = local.enable_autopilot - namespace = local.kubernetes_namespace - create_namespace = true - enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard - k8s_service_account = local.ray_service_account - depends_on = [module.namespace, module.kuberay-workload-identity] -} - -module "kuberay-cluster" { - source = "../../modules/kuberay-cluster" - providers = { helm = helm.rag, kubernetes = kubernetes.rag } - project_id = var.project_id - namespace = local.kubernetes_namespace - enable_gpu = true - gcs_bucket = var.gcs_bucket - autopilot_cluster = local.enable_autopilot - cloudsql_instance_name = local.cloudsql_instance - db_region = local.cloudsql_instance_region - google_service_account = local.ray_service_account - disable_network_policy = var.disable_ray_cluster_network_policy - use_custom_image = true - additional_labels = var.additional_labels - - # Implicit dependency - db_secret_name = module.cloudsql.db_secret_name - grafana_host = module.kuberay-monitoring.grafana_uri - - # IAP Auth parameters - add_auth = var.ray_dashboard_add_auth - create_brand = var.create_brand - support_email = var.support_email - client_id = var.ray_dashboard_client_id - client_secret = var.ray_dashboard_client_secret - k8s_ingress_name = var.ray_dashboard_k8s_ingress_name - k8s_iap_secret_name = var.ray_dashboard_k8s_iap_secret_name - k8s_managed_cert_name = var.ray_dashboard_k8s_managed_cert_name - k8s_backend_config_name = var.ray_dashboard_k8s_backend_config_name - k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port - domain = var.ray_dashboard_domain - members_allowlist = var.ray_dashboard_members_allowlist != "" ? 
split(",", var.ray_dashboard_members_allowlist) : [] - depends_on = [module.gcs, module.kuberay-workload-identity] -} - -module "inference-server" { - source = "../../modules/inference-service" - providers = { kubernetes = kubernetes.rag } - namespace = local.kubernetes_namespace - additional_labels = var.additional_labels - autopilot_cluster = local.enable_autopilot - depends_on = [module.namespace] -} - -module "frontend" { - source = "./frontend" - providers = { helm = helm.rag, kubernetes = kubernetes.rag } - project_id = var.project_id - create_service_account = var.create_rag_service_account - google_service_account = local.rag_service_account - namespace = local.kubernetes_namespace - additional_labels = var.additional_labels - inference_service_endpoint = module.inference-server.inference_service_endpoint - cloudsql_instance = module.cloudsql.instance - cloudsql_instance_region = local.cloudsql_instance_region - db_secret_name = module.cloudsql.db_secret_name - dataset_embeddings_table_name = var.dataset_embeddings_table_name - - # IAP Auth parameters - add_auth = var.frontend_add_auth - create_brand = var.create_brand - support_email = var.support_email - client_id = var.frontend_client_id - client_secret = var.frontend_client_secret - k8s_ingress_name = var.frontend_k8s_ingress_name - k8s_managed_cert_name = var.frontend_k8s_managed_cert_name - k8s_iap_secret_name = var.frontend_k8s_iap_secret_name - k8s_backend_config_name = var.frontend_k8s_backend_config_name - k8s_backend_service_name = var.frontend_k8s_backend_service_name - k8s_backend_service_port = var.frontend_k8s_backend_service_port - domain = var.frontend_domain - members_allowlist = var.frontend_members_allowlist != "" ? split(",", var.frontend_members_allowlist) : [] - depends_on = [module.namespace] -} - -resource "helm_release" "gmp-apps" { - name = "gmp-apps" - provider = helm.rag - chart = "../../charts/gmp-engine/" - namespace = local.kubernetes_namespace - # Timeout is increased to guarantee sufficient scale-up time for Autopilot nodes. - timeout = 1200 - depends_on = [module.inference-server, module.frontend] - values = [ - "${file("${path.module}/podmonitoring.yaml")}" - ] -} - diff --git a/cloudbuild.yaml b/cloudbuild.yaml index d5ea41498..2432569a7 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -12,376 +12,90 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-steps: - - id: "validate platform" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "infrastructure/" - waitFor: ["-"] - - - id: "validate ray" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "applications/ray/" - waitFor: ["validate platform"] - - - id: "validate jupyterhub" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "applications/jupyter/" - waitFor: ["validate platform"] - - - id: "validate rag" - name: "gcr.io/$PROJECT_ID/terraform" - script: | - terraform init -no-color - terraform validate -no-color - dir: "applications/rag/" - waitFor: ["validate platform"] - - # Create cluster to test ray, jupyterhub, rag - # - id: 'create gke cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # env: - # - "KUBE_LOAD_CONFIG_FILE=false" - # entrypoint: 'sh' - # args: - # - '-c' - # - | - # set -e - - # terraform apply \ - # -var-file=tfvars_tests/standard-gke-public.platform.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=subnetwork_region=$_REGION \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ - # -var=cluster_location=$_REGION \ - # -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \ - # -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \ - # -auto-approve -no-color - # echo "pass" > /workspace/gke_cluster_result.txt - # dir: 'infrastructure/' - # allowFailure: true - # waitFor: ['validate platform', 'validate ray', 'validate jupyterhub', validate rag] - - # - id: 'test ray cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'sh' - # args: - # - '-c' - # - | - # set -e - - # # Get kube config - # gcloud container clusters get-credentials \ - # ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # --location $_REGION \ - # --project $PROJECT_ID - - # cd /workspace/applications/ray/ - # terraform apply \ - # -var-file=workloads.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ - # -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ - # -var=enable_gpu=true \ - # -auto-approve -no-color - # echo "pass" > /workspace/user_result.txt - - # chmod +x /workspace/scripts/ci/wait_for_pods.sh - # /workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000 - - # kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s - # # Ray head's readinessProbe is not probing the head service today. Therefore the wait for ready above is not reliable. 
- # sleep 60s - # kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 & - # # Wait port-forwarding to take its place - # sleep 10s - - # ray job submit \ - # --address=http://127.0.0.1:8265 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" - # echo "pass" > /workspace/ray_result.txt - # allowFailure: true - # waitFor: ['create gke cluster'] - - # - id: 'cleanup ray cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/applications/ray/ - # terraform destroy \ - # -var-file=workloads.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-ray \ - # -var=workload_identity_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-ray-$SHORT_SHA-$_BUILD_ID \ - # -var=enable_gpu=true \ - # -auto-approve -no-color - # allowFailure: true - # waitFor: ['test ray cluster'] - - # - id: 'test jupyterhub' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/modules/jupyter/tests - # python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER - - # cd /workspace/applications/jupyter - # terraform apply \ - # -var-file=workloads-without-iap.example.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ - # -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # echo "pass" > /workspace/jupyterhub_tf_result.txt - - # kubectl wait --for=condition=Ready pods -n ml-$SHORT_SHA-$_BUILD_ID-jupyter -l 'component!=continuous-image-puller' --timeout=1800s - # kubectl get services -n ml-$SHORT_SHA-$_BUILD_ID-jupyter - # kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-jupyter service/proxy-public 9442:80 & - # # Wait port-forwarding to take its place - # sleep 5s - - # cd /workspace/modules/jupyter/tests - # python3 test_hub.py "127.0.0.1:9442" $_AUTOPILOT_CLUSTER - # echo "pass" > /workspace/jupyterhub_test_result.txt - # allowFailure: true - # waitFor: ['create gke cluster'] - - # - id: 'cleanup jupyterhub' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/applications/jupyter/ - # terraform destroy \ - # -var-file=workloads-without-iap.example.tfvars \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=ml-$SHORT_SHA-$_BUILD_ID-jupyter \ - # -var=workload_identity_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-jupyter-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # allowFailure: true - # waitFor: ['test jupyterhub'] - - # - id: 'test rag' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'sh' - # secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY'] - # args: - # - '-c' - # - | - # set -e - - # # Get kube config - # gcloud container clusters get-credentials \ - # ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # --location $_REGION \ - # --project $PROJECT_ID - - # cd /workspace/modules/jupyter/tests - # python3 change_jupyter_config.py 
$_AUTOPILOT_CLUSTER - - # cd /workspace/applications/rag/ - # terraform apply \ - # -var-file=workloads.tfvars \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=create_cluster=false \ - # -var=jupyter_add_auth=false \ - # -var=frontend_add_auth=false \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # echo "pass" > /workspace/rag_tf_result.txt - - # # Validate Ray: Make sure pods are running - # kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s - # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 & - # # Wait port-forwarding to take its place - # sleep 5s - - # # Validate Ray: Check dashboard - # ray job submit --working-dir ./tests \ - # --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" - # echo "pass" > /workspace/rag_ray_dashboard_result.txt - - # # Validate JupyterHub: Get hub url - # kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID - # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 & - # # Wait port-forwarding to take its place - # sleep 5s - - # # Validate JupyterHub: Test Hub - # cd /workspace/modules/jupyter/tests - # python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER - # echo "pass" > /workspace/rag_jupyterhub_test_result.txt - - # # Validate RAG: Test rag frontend - # kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 & - # # Wait port-forwarding to take its place - # sleep 5s - - # cd /workspace/applications/rag/tests - # python3 test_frontend.py "127.0.0.1:8081" - # echo "pass" > /workspace/rag_frontend_result.txt - - # cd /workspace/ - # sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb - # sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb - # gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/ - # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py - # # Wait for jupyterhub to trigger notebook pod startup - # sleep 5s - # kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s - # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb - # kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py - - # python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt" - # echo "pass" > /workspace/rag_prompt_result.txt - - # allowFailure: true - # waitFor: ['create gke cluster'] - - # - id: 'cleanup 
rag' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/applications/rag/ - # terraform destroy \ - # -var-file=workloads.tfvars \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=create_cluster=false \ - # -var=jupyter_add_auth=false \ - # -var=frontend_add_auth=false \ - # -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=cluster_location=$_REGION \ - # -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ - # -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ - # -var=ray_service_account=ray-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=rag_service_account=rag-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=jupyter_service_account=jupyter-sa-$SHORT_SHA-$_BUILD_ID \ - # -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ - # -auto-approve -no-color - # allowFailure: true - # waitFor: ['test rag'] - - # - id: 'cleanup gke cluster' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # set -e - - # cd /workspace/infrastructure - # terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \ - # -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ - # -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ - # -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \ - # -var=cluster_location=$_REGION -auto-approve -no-color - - # allowFailure: true - # waitFor: ['cleanup rag', 'cleanup jupyterhub', 'cleanup ray cluster'] - - # - id: 'check result' - # name: 'gcr.io/$PROJECT_ID/terraform' - # entrypoint: 'bash' - # args: - # - '-c' - # - | - # if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then - # echo "gke cluster creation failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/ray_result.txt) != "pass" ]]; then - # echo "ray API run failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/user_result.txt) != "pass" ]]; then - # echo "ray cluster failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/jupyterhub_tf_result.txt) != "pass" ]]; then - # echo "jupyterhub tf failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/jupyterhub_test_result.txt) != "pass" ]]; then - # echo "jupyterhub test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_tf_result.txt) != "pass" ]]; then - # echo "rag tf failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_ray_dashboard_result.txt) != "pass" ]]; then - # echo "rag ray dashboard test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_jupyterhub_test_result.txt) != "pass" ]]; then - # echo "rag jupyterhub test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_frontend_result.txt) != "pass" ]]; then - # echo "rag frontend test failed" - # exit 1 - # fi - - # if [[ $(cat /workspace/rag_prompt_result.txt) != "pass" ]]; then - # echo "rag prompt test failed" - # exit 1 - # fi - # waitFor: ['cleanup gke cluster'] +- id: 'test rag' + name: 'gcr.io/$PROJECT_ID/terraform' + entrypoint: 'sh' + secretEnv: ['KAGGLE_USERNAME', 'KAGGLE_KEY'] + args: + - '-c' + - | + set -e + + # Get kube config + gcloud container clusters get-credentials \ + ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + --location $_REGION \ + --project $PROJECT_ID + + cd /workspace/modules/jupyter/tests + python3 change_jupyter_config.py $_AUTOPILOT_CLUSTER + + cd /workspace/applications/rag/ + 
terraform apply \ + -var-file=workloads.tfvars \ + -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \ + -var=create_cluster=false \ + -var=jupyter_add_auth=false \ + -var=frontend_add_auth=false \ + -var=project_id=$PROJECT_ID \ + -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \ + -var=cluster_location=$_REGION \ + -var=kubernetes_namespace=rag-$SHORT_SHA-$_BUILD_ID \ + -var=gcs_bucket=gke-aieco-rag-$SHORT_SHA-$_BUILD_ID \ + -var=ray_service_account=ray-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + -var=rag_service_account=rag-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + -var=jupyter_service_account=jupyter-sa-4-rag-$SHORT_SHA-$_BUILD_ID \ + -var=cloudsql_instance=pgvector-instance-$SHORT_SHA-$_BUILD_ID \ + -auto-approve -no-color + echo "pass" > /workspace/rag_tf_result.txt + + # Validate Ray: Make sure pods are running + kubectl wait --for=condition=Ready pods -n rag-$SHORT_SHA-$_BUILD_ID -l 'component!=continuous-image-puller' --timeout=1200s + kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/ray-cluster-kuberay-head-svc 8262:8265 & + # Wait port-forwarding to take its place + sleep 5s + + # Validate Ray: Check dashboard + ray job submit --working-dir ./tests \ + --address=http://127.0.0.1:8262 -- python -c "import ray; ray.init(); print(ray.cluster_resources())" + echo "pass" > /workspace/rag_ray_dashboard_result.txt + + # Validate JupyterHub: Get hub url + kubectl get services -n rag-$SHORT_SHA-$_BUILD_ID + kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/proxy-public 9443:80 & + # Wait port-forwarding to take its place + sleep 5s + + # Validate JupyterHub: Test Hub + cd /workspace/modules/jupyter/tests + python3 test_hub.py "127.0.0.1:9443" $_AUTOPILOT_CLUSTER + echo "pass" > /workspace/rag_jupyterhub_test_result.txt + + # Validate RAG: Test rag frontend + kubectl port-forward -n rag-$SHORT_SHA-$_BUILD_ID service/rag-frontend 8081:8080 & + # Wait port-forwarding to take its place + sleep 5s + + cd /workspace/applications/rag/tests + python3 test_frontend.py "127.0.0.1:8081" + echo "pass" > /workspace/rag_frontend_result.txt + + cd /workspace/ + sed -i "s//$$KAGGLE_USERNAME/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb + sed -i "s//$$KAGGLE_KEY/g" ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb + gsutil cp ./applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb gs://gke-aieco-rag-$SHORT_SHA-$_BUILD_ID/ + kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID $(kubectl get pod -l app=jupyterhub,component=hub -n rag-$SHORT_SHA-$_BUILD_ID -o jsonpath="{.items[0].metadata.name}") -- jupyterhub token admin --log-level=CRITICAL | xargs python3 ./applications/rag/notebook_starter.py + # Wait for jupyterhub to trigger notebook pod startup + sleep 5s + kubectl wait --for=condition=Ready pod/jupyter-admin -n rag-$SHORT_SHA-$_BUILD_ID --timeout=500s + kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- jupyter nbconvert --to script /data/rag-kaggle-ray-sql-interactive.ipynb + kubectl exec -it -n rag-$SHORT_SHA-$_BUILD_ID jupyter-admin -c notebook -- ipython /data/rag-kaggle-ray-sql-interactive.py + + python3 ./applications/rag/tests/test_rag.py "http://127.0.0.1:8081/prompt" + echo "pass" > /workspace/rag_prompt_result.txt + + allowFailure: true + waitFor: ['create gke cluster'] substitutions: _REGION: us-east4 @@ -393,9 +107,9 @@ options: substitutionOption: "ALLOW_LOOSE" machineType: "E2_HIGHCPU_8" timeout: 5400s -# availableSecrets: -# secretManager: -# - 
versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest
-#       env: "KAGGLE_USERNAME"
-#     - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest
-#       env: "KAGGLE_KEY"
+availableSecrets:
+  secretManager:
+  - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-username/versions/latest
+    env: "KAGGLE_USERNAME"
+  - versionName: projects/gke-ai-eco-dev/secrets/cloudbuild-kaggle-key/versions/latest
+    env: "KAGGLE_KEY"
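
For readability, the snippet below restates the Cloud SQL connection-pool setup that the notebook hunk above relocates into the later cell, as plain Python rather than JSON-escaped notebook source. It is a sketch only: it reuses the CLOUDSQL_INSTANCE_CONNECTION_NAME environment variable, the /etc/secret-volume credential files, and the pgvector-database name exactly as they appear in the diff, and introduces nothing new.

    import os
    import sqlalchemy
    from google.cloud.sql.connector import Connector, IPTypes

    # initialize parameters
    INSTANCE_CONNECTION_NAME = os.environ["CLOUDSQL_INSTANCE_CONNECTION_NAME"]
    print(f"Your instance connection name is: {INSTANCE_CONNECTION_NAME}")
    DB_NAME = "pgvector-database"

    # credentials are mounted into the pod as a Kubernetes secret volume
    with open("/etc/secret-volume/username", "r") as f:
        DB_USER = f.read()
    with open("/etc/secret-volume/password", "r") as f:
        DB_PASS = f.read()

    # initialize Connector object
    connector = Connector()

    # return a pg8000 connection through the Cloud SQL connector over private IP
    def getconn():
        return connector.connect(
            INSTANCE_CONNECTION_NAME,
            "pg8000",
            user=DB_USER,
            password=DB_PASS,
            db=DB_NAME,
            ip_type=IPTypes.PRIVATE,
        )

    # create a SQLAlchemy connection pool whose 'creator' is the function above
    pool = sqlalchemy.create_engine(
        "postgresql+pg8000://",
        creator=getconn,
    )

The pool is then used by the later notebook cell (declarative_base, scoped_session, and the pgvector Vector column) to write embeddings into the pgvector-database instance.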