
Commit

Merge branch 'main' into bump-tgi
laoj2 committed Mar 6, 2024
2 parents afb8aed + ffd5a19 commit 840ad2c
Showing 19 changed files with 355 additions and 74 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,35 @@
name: Terraform CI
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
jobs:
  Terraform-Lint-Check:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.5.7"

      - name: Terraform fmt
        id: fmt
        run: terraform fmt -check -recursive

      - name: Terraform Init
        id: init
        run: |
          terraform -chdir=applications/rag init
          terraform -chdir=applications/ray init
          terraform -chdir=applications/jupyter init
      - name: Terraform Validate
        id: validate
        run: |
          terraform -chdir=applications/rag validate -no-color
          terraform -chdir=applications/ray validate -no-color
          terraform -chdir=applications/jupyter validate -no-color
73 changes: 49 additions & 24 deletions applications/rag/README.md
@@ -32,7 +32,7 @@ CLUSTER_REGION=us-central1
```
2. Use the following instructions to create a GKE cluster. We recommend using Autopilot for a simpler setup.

##### Autopilot (recommended)
##### Autopilot

RAG requires the latest Autopilot features, available on GKE cluster version `1.29.1-gke.1575000`+
```
@@ -46,23 +46,9 @@ gcloud container clusters create-auto ${CLUSTER_NAME:?} \
--cluster-version ${CLUSTER_VERSION:?}
```

##### Standard
##### Standard (recommended)

1. To create a GKE Standard cluster using Terraform, please follow the [instructions here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/infrastructure/README.md).

TODO: Add GKE cluster requirements for a successful installation.

2. The inference server requires L4 GPUs. Create an additional node pool:
```
gcloud container node-pools create g2-standard-24 --cluster ${CLUSTER_NAME:?} \
--accelerator type=nvidia-l4,count=2,gpu-driver-version=latest \
--machine-type g2-standard-24 \
--ephemeral-storage-local-ssd=count=2 \
--enable-image-streaming \
--num-nodes=1 --min-nodes=1 --max-nodes=2 \
--node-locations ${CLUSTER_REGION:?}-a,${CLUSTER_REGION:?}-b \
--location=${CLUSTER_REGION:?}
```
1. To create a GKE Standard cluster using Terraform, follow the [instructions here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/infrastructure/README.md). Use the preconfigured node pools in `/infrastructure/platform.tfvars` as this solution requires T4s and L4s.

#### Setup Components

@@ -105,7 +91,11 @@ gcloud container clusters get-credentials ${CLUSTER_NAME:?} --location ${CLUSTER
1. Verify KubeRay is set up: run `kubectl get pods -n ${NAMESPACE:?}`. There should be a Ray head pod (and, on GKE Standard only, a Ray worker pod) in the `Running` state (prefixed by `example-cluster-kuberay-head-` and `example-cluster-kuberay-worker-workergroup-`).

2. Verify the JupyterHub service is set up:
* Fetch the service IP: `kubectl get services proxy-public -n ${NAMESPACE:?} --output jsonpath='{.status.loadBalancer.ingress[0].ip}'`
* Fetch the service IP/Domain:
* IAP disabled: `kubectl get services proxy-public -n $NAMESPACE --output jsonpath='{.status.loadBalancer.ingress[0].ip}'`
* IAP enabled: read the terraform output `jupyter_uri` or use the command: `kubectl get managedcertificates jupyter-managed-cert -n $NAMESPACE --output jsonpath='{.status.domainStatus[0].domain}'`
* Remember to log in to [Google Cloud Platform IAP](https://pantheon.corp.google.com/security/iap) and check that your user has the role `IAP-secured Web App User`
* Wait for the domain status to be `Active`
* Go to the IP/domain in a browser, which should display the JupyterLab login UI.

3. Verify the instance `pgvector-instance` exists: `gcloud sql instances list | grep pgvector`
@@ -132,17 +122,28 @@ EOF
* At the end of the smoke test with the TGI server, stop port forwarding by using Ctrl-C on the original terminal.

5. Verify the frontend chat interface is set up:
* Verify the service exists: `kubectl get services rag-frontend -n ${NAMESPACE:?}`
* Verify the deployment exists: `kubectl get deployments rag-frontend -n ${NAMESPACE:?}` and ensure the deployment is in `READY` state.
* Verify the service exists: `kubectl get services rag-frontend -n ${NAMESPACE:?}`
* Verify the deployment exists: `kubectl get deployments rag-frontend -n ${NAMESPACE:?}` and ensure the deployment is in `READY` state.
* Verify the managed certificate is `Active`:
```
kubectl get managedcertificates frontend-managed-cert -n rag --output jsonpath='{.status.domainStatus[0].status}'
```
* Verify IAP is enabled:
```
gcloud compute backend-services list --format="table(name, backends, iap.enabled)"
```

### Vector Embeddings for Dataset

This step generates the vector embeddings for your input dataset. Currently, the default dataset is [Google Maps Restaurant Reviews](https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews). We will use a Jupyter notebook to run a Ray job that generates the embeddings & populates them into the instance `pgvector-instance` created above.
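
For orientation, here is a minimal, single-node sketch of what that embedding job boils down to, using packages the notebook installs (`sentence-transformers`, `cloud-sql-python-connector[pg8000]`, `SQLAlchemy`). The embedding model, connection name, database name, and table schema are illustrative assumptions rather than the notebook's actual values, and the real job distributes this work across Ray workers:

```
import sqlalchemy
from google.cloud.sql.connector import Connector
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

reviews = [
    "Great pizza, but the service was slow.",
    "Friendly staff and fast delivery.",
]
embeddings = model.encode(reviews)  # one fixed-size vector per review

connector = Connector()

def get_conn():
    # "<PROJECT>:<REGION>:pgvector-instance" is a placeholder connection name;
    # the user is the one created in step 1 below.
    return connector.connect(
        "<PROJECT>:<REGION>:pgvector-instance",
        "pg8000",
        user="rag-user-notebook",
        password="<the password chosen in step 1>",
        db="<database name>",
    )

engine = sqlalchemy.create_engine("postgresql+pg8000://", creator=get_conn)
with engine.begin() as conn:
    for review, emb in zip(reviews, embeddings):
        # Assumed schema: a text column plus a pgvector `embedding` column.
        conn.execute(
            sqlalchemy.text(
                "INSERT INTO googlemaps_reviews_db (text, embedding) "
                "VALUES (:t, CAST(:e AS vector))"
            ),
            {"t": review, "e": str(emb.tolist())},
        )
```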

1. Create a CloudSQL user to access the database: `gcloud sql users create rag-user-notebook --password=<choose a password> --instance=pgvector-instance --host=%`

2. Go to the Jupyterhub service endpoint in a browser: `kubectl get services proxy-public -n ${NAMESPACE:?} --output jsonpath='{.status.loadBalancer.ingress[0].ip}'`

2. Go to the JupyterHub service endpoint in a browser:
* IAP disabled: `kubectl get services proxy-public -n $NAMESPACE --output jsonpath='{.status.loadBalancer.ingress[0].ip}'`
* IAP enabled: read the terraform output `jupyter_uri` or use the command: `kubectl get managedcertificates jupyter-managed-cert -n $NAMESPACE --output jsonpath='{.status.domainStatus[0].domain}'`
* Remember to log in to GCP and check that your user has the role `IAP-secured Web App User`
* Wait for the domain status to be `Active`
3. Log in with placeholder credentials [TBD: replace with instructions for IAP]:
* username: user
* password: use `terraform output jupyter_password` to fetch the password value
@@ -166,11 +167,35 @@ This step generates the vector embeddings for your input dataset. Currently, the

### Launch the Frontend Chat Interface

1. Setup port forwarding for the frontend [TBD: Replace with IAP]: `kubectl port-forward service/rag-frontend -n ${NAMESPACE:?} 8080:8080 &`
#### Accessing the Frontend with IAP Disabled
1. Set up port forwarding for the frontend: `kubectl port-forward service/rag-frontend -n $NAMESPACE 8080:8080 &`

2. Go to `localhost:8080` in a browser & start chatting! This will fetch context related to your prompt from the vector embeddings in the `pgvector-instance`, augment the original prompt with the context & query the inference model (`mistral-7b`) with the augmented prompt (a sketch of this flow follows below).

3. TODO: Add some example prompts for the dataset.
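
For orientation, here is a minimal, hypothetical sketch of that retrieve-augment-generate round trip. It reuses the table, column, and model assumptions from the embedding sketch above; the in-cluster service URL is a placeholder, though the `/generate` request and response shapes follow the standard TGI API. The deployed frontend's actual code may differ:

```
import requests
import sqlalchemy
from sentence_transformers import SentenceTransformer

# Must match the model used to generate the stored embeddings.
model = SentenceTransformer("all-MiniLM-L6-v2")

TGI_URL = "http://<inference-service>:8080/generate"  # placeholder in-cluster URL

def answer(prompt: str, engine: sqlalchemy.Engine) -> str:
    # 1. Retrieve: embed the prompt and fetch the nearest stored reviews
    #    by pgvector distance (`<->`).
    query_vec = str(model.encode(prompt).tolist())
    with engine.connect() as conn:
        rows = conn.execute(
            sqlalchemy.text(
                "SELECT text FROM googlemaps_reviews_db "
                "ORDER BY embedding <-> CAST(:q AS vector) LIMIT 4"
            ),
            {"q": query_vec},
        ).fetchall()
    context = "\n".join(row[0] for row in rows)

    # 2. Augment: prepend the retrieved context to the user's question.
    augmented = f"Context:\n{context}\n\nQuestion: {prompt}\nAnswer:"

    # 3. Generate: send the augmented prompt to the inference server.
    resp = requests.post(
        TGI_URL,
        json={"inputs": augmented, "parameters": {"max_new_tokens": 256}},
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["generated_text"]
```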
#### Accessing the Frontend with IAP Enabled
1. Verify IAP is Enabled

* Ensure that IAP is enabled on Google Cloud Platform (GCP) for your application. If you encounter any errors, try re-enabling IAP.

2. Verify User Role

* Make sure you have the role `IAP-secured Web App User` assigned to your user account. This role is necessary to access the application through IAP.

3. Verify Domain is Active
* Make sure the domain is active using the command:
`kubectl get managedcertificates frontend-managed-cert -n rag --output jsonpath='{.status.domainStatus[0].status}'`

4. Retrieve the Domain

* Read terraform output `frontend_uri` or use the following command to find the domain created by IAP for accessing your service:
`kubectl get managedcertificates frontend-managed-cert -n $NAMESPACE --output jsonpath='{.status.domainStatus[0].domain}'`

5. Access the Frontend

* Open your browser and navigate to the domain you retrieved in the previous step to start chatting!

#### Example Prompts
[TODO: Add some example prompts for the dataset.]

### Cleanup

@@ -7,8 +7,8 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install langchain ray==2.7.1 datasets sentence-transformers kaggle\n",
"!pip install \"cloud-sql-python-connector[pg8000]\" SQLAlchemy==2.0.7"
"!pip install langchain==0.1.9 ray==2.7.1 datasets==2.18.0 sentence-transformers==2.5.1 kaggle==1.6.6\n",
"!pip install \"cloud-sql-python-connector[pg8000]==1.7.0\" SQLAlchemy==2.0.7"
]
},
{
@@ -294,14 +294,14 @@
" runtime_env={\n",
" \"working_dir\": \"/home/jovyan/test\", # upload the local working directory to ray workers\n",
" \"pip\": [\n",
" \"langchain\",\n",
" \"langchain==0.1.9\",\n",
" \"transformers\",\n",
" \"sentence-transformers\",\n",
" \"sentence-transformers==2.5.1\",\n",
" \"pyarrow\",\n",
" \"datasets\",\n",
" \"datasets==2.18.0\",\n",
" \"torch==2.0.1\",\n",
" \"cloud-sql-python-connector[pg8000]\",\n",
" \"SQLAlchemy\",\n",
" \"cloud-sql-python-connector[pg8000]==1.7.0\",\n",
" \"SQLAlchemy==2.0.7\",\n",
" \"huggingface_hub\",\n",
" ]\n",
" }\n",
22 changes: 12 additions & 10 deletions applications/rag/main.tf
@@ -89,10 +89,10 @@ provider "helm" {
}

module "namespace" {
source = "../../modules/kubernetes-namespace"
providers = { helm = helm.rag}
source = "../../modules/kubernetes-namespace"
providers = { helm = helm.rag }
create_namespace = true
namespace = var.kubernetes_namespace
namespace = var.kubernetes_namespace
}

module "kuberay-operator" {
@@ -115,12 +115,12 @@ module "gcs" {
}

module "cloudsql" {
source = "../../modules/cloudsql"
providers = { kubernetes = kubernetes.rag }
project_id = var.project_id
instance_name = var.cloudsql_instance
namespace = var.kubernetes_namespace
depends_on = [module.namespace]
source = "../../modules/cloudsql"
providers = { kubernetes = kubernetes.rag }
project_id = var.project_id
instance_name = var.cloudsql_instance
namespace = var.kubernetes_namespace
depends_on = [module.namespace]
}

module "jupyterhub" {
@@ -181,6 +181,7 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = var.ray_service_account
depends_on = [module.namespace]
}

module "inference-server" {
@@ -199,6 +200,7 @@ module "frontend" {
google_service_account = var.rag_service_account
namespace = var.kubernetes_namespace
inference_service_endpoint = module.inference-server.inference_service_endpoint
cloudsql_instance = module.cloudsql.instance
db_secret_name = module.cloudsql.db_secret_name
db_secret_namespace = module.cloudsql.db_secret_namespace
dataset_embeddings_table_name = var.dataset_embeddings_table_name
@@ -218,5 +220,5 @@
url_domain_addr = var.frontend_url_domain_addr
url_domain_name = var.frontend_url_domain_name
members_allowlist = var.frontend_members_allowlist
depends_on = [ module.namespace ]
depends_on = [module.namespace]
}
11 changes: 11 additions & 0 deletions applications/rag/tests/test_frontend.py
@@ -0,0 +1,11 @@
import sys
import requests

def test_frontend_up(rag_frontend_url):
    # Smoke test: the frontend should answer a plain GET with a 2xx status.
    r = requests.get(rag_frontend_url)
    r.raise_for_status()
    print("Rag frontend is up.")

hub_url = "http://" + sys.argv[1]  # frontend IP or domain passed on the command line

test_frontend_up(hub_url)
6 changes: 3 additions & 3 deletions applications/rag/variables.tf
@@ -209,7 +209,7 @@ variable "jupyter_k8s_backend_service_name" {

variable "jupyter_k8s_backend_service_port" {
type = number
description = "NName of the Backend Service Port"
description = "Name of the Backend Service Port"
default = 80
}

@@ -265,9 +265,9 @@ variable "autopilot_cluster" {
}

variable "cloudsql_instance" {
type = string
type = string
description = "Name of the CloudSQL instance for RAG VectorDB"
default = "pgvector-instance"
default = "pgvector-instance"
}

variable "cpu_pools" {
2 changes: 1 addition & 1 deletion applications/rag/workloads.tfvars
@@ -38,7 +38,7 @@ rag_service_account = "rag-system-account"

# Creates a google service account & k8s service account & configures workload identity with appropriate permissions.
# Set to false & update the variable `jupyter_service_account` to use an existing IAM service account.
jupyter_service_account = "jupyter-system-account"
jupyter_service_account = "jupyter-system-account"

## Embeddings table name - change this to the TABLE_NAME used in the notebook.
dataset_embeddings_table_name = "googlemaps_reviews_db"
@@ -15,7 +15,7 @@
terraform {
required_providers {
google = {
source = "hashicorp/google"
source = "hashicorp/google"
}
kubernetes = {
source = "hashicorp/kubernetes"
4 changes: 2 additions & 2 deletions applications/ray/versions.tf
@@ -15,10 +15,10 @@
terraform {
required_providers {
google = {
source = "hashicorp/google"
source = "hashicorp/google"
}
google-beta = {
source = "hashicorp/google-beta"
source = "hashicorp/google-beta"
}
helm = {
source = "hashicorp/helm"
@@ -21,5 +21,5 @@ tokenizer = "tiiuae/falcon-7b"
# Benchmark configuration for triggering single test via Locust Runner
test_duration = 60
# Increase test_users to allow more parallelism (especially when testing HPA)
test_users = 1
test_rate = 5
test_users = 1
test_rate = 5