Skip to content

Commit

Permalink
fix: add e2e test for hg-tgi (#780)
Browse files Browse the repository at this point in the history
Change-Id: I454735bf758f5d3a28cb2a92e9c5e4c7161de8e5

Co-authored-by: Gen Lu <[email protected]>
  • Loading branch information
genlu2011 and Gen Lu committed Aug 29, 2024
1 parent b891db1 commit 203ddbc
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 1 deletion.
2 changes: 1 addition & 1 deletion tutorials-and-examples/hf-tgi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ gcloud container node-pools create g2-standard-24 --cluster l4-demo \
--num-nodes=1 --min-nodes=1 --max-nodes=2 \
--node-locations $REGION-a,$REGION-b --region $REGION
```
4. Provision the job and enable gathering metrics: `terrafrom apply`
4. Set the project_id in workloads.tfvars and create the application: `terraform apply -var-file=workloads.tfvars`
5. Make sure app started ok: `kubectl logs -l app=mistral-7b-instruct`
6. Set up port forward
```
Expand Down
162 changes: 162 additions & 0 deletions tutorials-and-examples/hf-tgi/cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

steps:
# Initialise and validate the Terraform configuration under infrastructure/.
- id: "validate platform"
  name: "gcr.io/$PROJECT_ID/terraform"
  dir: "infrastructure/"
  # A lone '-' declares no dependencies, so this step can start immediately.
  waitFor:
  - "-"
  script: |
    terraform init -no-color
    terraform validate -no-color

# Initialise and validate the hf-tgi example's Terraform configuration.
# Runs only after the platform validation step has finished.
- id: "validate hf"
  name: "gcr.io/$PROJECT_ID/terraform"
  dir: "tutorials-and-examples/hf-tgi/"
  waitFor:
  - "validate platform"
  script: |
    terraform init -no-color
    terraform validate -no-color

# Apply the platform Terraform to create the network, subnetwork and GKE
# cluster used by the test. Resource names are derived from the commit SHA,
# PR number and build id substitutions.
- id: "create gke cluster"
  name: "gcr.io/$PROJECT_ID/terraform"
  dir: "infrastructure/"
  waitFor:
  - "validate platform"
  - "validate hf"
  # Failure is tolerated here so the later steps still run; success is
  # recorded in /workspace/gke_cluster_result.txt, which the 'check result'
  # step reads.
  allowFailure: true
  env:
  - "KUBE_LOAD_CONFIG_FILE=false"
  entrypoint: "sh"
  args:
  - "-c"
  - |
    set -e
    terraform apply \
    -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
    -var=project_id=$PROJECT_ID \
    -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
    -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
    -var=subnetwork_region=$_REGION \
    -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
    -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
    -var=cluster_location=$_REGION \
    -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \
    -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \
    -auto-approve -no-color
    echo "pass" > /workspace/gke_cluster_result.txt

# End-to-end test of the hf-tgi example: fetch cluster credentials, apply the
# workload Terraform, wait for the pods, then send one /generate request
# through a local port-forward.
# NOTE: the id spelling ('hg-tgi') is kept as-is because the 'cleanup hf-tgi'
# step refers to it in its waitFor list.
- id: 'test hg-tgi'
  name: 'gcr.io/$PROJECT_ID/terraform'
  entrypoint: 'sh'
  args:
  - '-c'
  - |
    set -e
    # Get kube config
    gcloud container clusters get-credentials \
      ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
      --location $_REGION \
      --project $PROJECT_ID
    terraform apply \
      -var-file=workloads.tfvars \
      -var=project_id=$PROJECT_ID \
      -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
      -var=location=$_REGION \
      -var=namespace=ml-$SHORT_SHA-$_BUILD_ID-hf \
      -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
      -auto-approve -no-color
    # Make sure pods are running
    kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-hf --for=condition=Ready --timeout=1200s
    kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-hf service/mistral-7b-instruct-service 8080:80 &
    # Give the background port-forward time to start listening
    sleep 10s
    # --fail makes curl exit non-zero on an HTTP error status (>= 400).
    # Without it an error response still exits 0, `set -e` does not trigger,
    # and the step would record "pass" for a broken server.
    curl --fail 127.0.0.1:8080/generate -X POST \
      -H 'Content-Type: application/json' \
      --data '{"inputs": "[INST]Hello world![/INST]","parameters": {"max_new_tokens": 400}}'
    # Only reached when every command above succeeded; read by 'check result'.
    echo "pass" > /workspace/hf_result.txt
  # Failure is tolerated so the cleanup steps still run; the final verdict
  # comes from the marker file written above.
  allowFailure: true
  dir: 'tutorials-and-examples/hf-tgi/'
  waitFor: ['create gke cluster']

# Destroy the hf-tgi workload resources, passing the same variables that the
# 'test hg-tgi' step used for its apply.
- id: "cleanup hf-tgi"
  name: "gcr.io/$PROJECT_ID/terraform"
  waitFor:
  - "test hg-tgi"
  # A failed destroy does not stop the remaining steps.
  allowFailure: true
  entrypoint: "bash"
  args:
  - "-c"
  - |
    set -e
    cd /workspace/tutorials-and-examples/hf-tgi/
    terraform destroy \
    -var-file=workloads.tfvars \
    -var=project_id=$PROJECT_ID \
    -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
    -var=location=$_REGION \
    -var=namespace=ml-$SHORT_SHA-$_BUILD_ID-hf \
    -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
    -auto-approve -no-color

# Destroy the platform resources (cluster, network, subnetwork) once the
# workload cleanup step has run.
# NOTE(review): unlike the 'create gke cluster' apply, this destroy does not
# pass subnetwork_region, cpu_pools or gpu_pools - confirm the defaults in the
# tfvars file are acceptable for destroy.
- id: "cleanup gke cluster"
  name: "gcr.io/$PROJECT_ID/terraform"
  waitFor:
  - "cleanup hf-tgi"
  # A failed destroy does not stop the final 'check result' step.
  allowFailure: true
  entrypoint: "bash"
  args:
  - "-c"
  - |
    set -e
    cd /workspace/infrastructure
    terraform destroy -var-file=tfvars_tests/standard-gke-public.platform.tfvars -var=project_id=$PROJECT_ID \
    -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
    -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
    -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
    -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
    -var=cluster_location=$_REGION -auto-approve -no-color

# Final verdict for the build. The create and test steps run with
# allowFailure, so each one writes "pass" to a marker file under /workspace
# on success; this step fails the build if either marker is missing or wrong.
- id: 'check result'
  name: 'gcr.io/$PROJECT_ID/terraform'
  entrypoint: 'bash'
  args:
  - '-c'
  - |
    # A missing file yields an empty string, which also counts as a failure.
    if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then
      echo "gke cluster creation failed"
      exit 1
    fi
    if [[ $(cat /workspace/hf_result.txt) != "pass" ]]; then
      echo "hf-tgi test failed"
      exit 1
    fi
  waitFor: ['cleanup gke cluster']

# Default values for user-defined substitutions; a trigger or the command line
# may override them.
substitutions:
  _REGION: us-east4
  _USER_NAME: github
  # Kept as a string: it is interpolated into resource names and -var flags.
  _AUTOPILOT_CLUSTER: "false"
  # First 8 characters of the build id, used to keep resource names short.
  # This is a bash-style parameter expansion; see dynamicSubstitutions below.
  _BUILD_ID: "${BUILD_ID:0:8}"
options:
  # Do not fail the build when a referenced substitution (e.g. _PR_NUMBER)
  # is not defined.
  substitutionOption: 'ALLOW_LOOSE'
  # Required for the ${BUILD_ID:0:8} expansion to be interpreted when the
  # build is submitted manually; trigger-invoked builds always enable it.
  dynamicSubstitutions: true
  machineType: 'E2_HIGHCPU_8'
# Overall build timeout (90 minutes).
timeout: 5400s
8 changes: 8 additions & 0 deletions tutorials-and-examples/hf-tgi/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,18 @@ provider "helm" {
}
}

# Instantiates the shared kubernetes-namespace module for var.namespace with
# namespace creation enabled.
module "namespace" {
  source           = "../../modules/kubernetes-namespace"
  namespace        = var.namespace
  create_namespace = true
}

# Instantiates the shared inference-service module in var.namespace.
module "inference-server" {
  source = "../../modules/inference-service"

  # Ordered after the namespace module so the namespace is handled first.
  depends_on = [module.namespace]

  namespace         = var.namespace
  autopilot_cluster = var.autopilot_cluster
  additional_labels = var.additional_labels
}

resource "helm_release" "gmp-engine" {
Expand All @@ -68,4 +75,5 @@ resource "helm_release" "gmp-engine" {
values = [
"${file("${path.module}/podmonitoring.yaml")}"
]
depends_on = [module.namespace]
}
6 changes: 6 additions & 0 deletions tutorials-and-examples/hf-tgi/workloads.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Example variable values for the hf-tgi workload.
# Replace the project_id placeholder with your own project ID before applying.
project_id   = "<your project ID>"
cluster_name = "l4-demo"
location     = "us-central1"

# Namespace passed to the workload modules, and the cluster mode flag.
namespace         = "l4-demo"
autopilot_cluster = false

0 comments on commit 203ddbc

Please sign in to comment.