Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add e2e test for hg-tgi #780

Merged
merged 1 commit into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tutorials-and-examples/hf-tgi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ gcloud container node-pools create g2-standard-24 --cluster l4-demo \
--num-nodes=1 --min-nodes=1 --max-nodes=2 \
--node-locations $REGION-a,$REGION-b --region $REGION
```
4. Provision the job and enable gathering metrics: `terraform apply`
4. Set the project_id in workloads.tfvars and create the application: `terraform apply -var-file=workloads.tfvars`
5. Make sure app started ok: `kubectl logs -l app=mistral-7b-instruct`
6. Set up port forward
```
Expand Down
162 changes: 162 additions & 0 deletions tutorials-and-examples/hf-tgi/cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

steps:
# Initialise and validate the platform Terraform configuration in infrastructure/.
- id: 'validate platform'
  name: 'gcr.io/$PROJECT_ID/terraform'
  dir: 'infrastructure/'
  # '-' declares no dependencies, so this step starts as soon as the build begins.
  waitFor:
    - '-'
  script: |
    terraform init -no-color
    terraform validate -no-color

# Initialise and validate the hf-tgi example's Terraform configuration,
# after the platform validation step has finished.
- id: 'validate hf'
  name: 'gcr.io/$PROJECT_ID/terraform'
  dir: 'tutorials-and-examples/hf-tgi/'
  waitFor:
    - 'validate platform'
  script: |
    terraform init -no-color
    terraform validate -no-color

# Provision the test GKE cluster (one CPU pool, one L4 GPU pool) with Terraform.
# The "pass" marker file is written only when terraform apply succeeds.
- id: 'create gke cluster'
  name: 'gcr.io/$PROJECT_ID/terraform'
  dir: 'infrastructure/'
  waitFor:
    - 'validate platform'
    - 'validate hf'
  # A failure here is tolerated at step level; the outcome is reported
  # through the marker file written at the end of the script.
  allowFailure: true
  env:
    - 'KUBE_LOAD_CONFIG_FILE=false'
  entrypoint: 'sh'
  args:
    - '-c'
    - |
      set -e

      terraform apply \
        -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
        -var=project_id=$PROJECT_ID \
        -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
        -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
        -var=subnetwork_region=$_REGION \
        -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
        -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
        -var=cluster_location=$_REGION \
        -var='cpu_pools=[{initial_node_count=2,name="cpu-pool",machine_type="n1-standard-16",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-standard",}]' \
        -var='gpu_pools=[{initial_node_count=2,name="gpu-pool",machine_type="g2-standard-24",autoscaling=true,min_count=1,max_count=3,disk_size_gb=100,disk_type="pd-balanced",accelerator_count=2,accelerator_type="nvidia-l4",gpu_driver_version="DEFAULT",}]' \
        -auto-approve -no-color
      echo "pass" > /workspace/gke_cluster_result.txt

# Deploy the hf-tgi example onto the test cluster and smoke-test its
# /generate endpoint. The "pass" marker file is written only if every
# command in the script succeeds.
# The step id is kept unchanged because another step refers to it via waitFor.
- id: 'test hg-tgi'
  name: 'gcr.io/$PROJECT_ID/terraform'
  entrypoint: 'sh'
  args:
    - '-c'
    - |
      set -e

      # Get kube config
      gcloud container clusters get-credentials \
        ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
        --location $_REGION \
        --project $PROJECT_ID

      terraform apply \
        -var-file=workloads.tfvars \
        -var=project_id=$PROJECT_ID \
        -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
        -var=location=$_REGION \
        -var=namespace=ml-$SHORT_SHA-$_BUILD_ID-hf \
        -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
        -auto-approve -no-color

      # Make sure pods are running
      kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-hf --for=condition=Ready --timeout=1200s
      kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-hf service/mistral-7b-instruct-service 8080:80 &
      # Give the background port-forward time to start listening
      sleep 10s

      # --fail makes curl exit non-zero on an HTTP error response, so a
      # server-side failure stops the script before the marker is written.
      curl --fail 127.0.0.1:8080/generate -X POST \
        -H 'Content-Type: application/json' \
        --data '{"inputs": "[INST]Hello world![/INST]","parameters": {"max_new_tokens": 400}}'
      echo "pass" > /workspace/hf_result.txt
  # Tolerate failure at step level; the result is reported through the marker file.
  allowFailure: true
  dir: 'tutorials-and-examples/hf-tgi/'
  waitFor: ['create gke cluster']

# Tear down the hf-tgi workload with the same variables used to create it.
- id: 'cleanup hf-tgi'
  name: 'gcr.io/$PROJECT_ID/terraform'
  waitFor:
    - 'test hg-tgi'
  # A failed destroy is tolerated at step level.
  allowFailure: true
  entrypoint: 'bash'
  args:
    - '-c'
    - |
      set -e

      cd /workspace/tutorials-and-examples/hf-tgi/

      terraform destroy \
        -var-file=workloads.tfvars \
        -var=project_id=$PROJECT_ID \
        -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
        -var=location=$_REGION \
        -var=namespace=ml-$SHORT_SHA-$_BUILD_ID-hf \
        -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
        -auto-approve -no-color

# Destroy the test GKE cluster and its network once the workload cleanup has run.
- id: 'cleanup gke cluster'
  name: 'gcr.io/$PROJECT_ID/terraform'
  waitFor:
    - 'cleanup hf-tgi'
  # A failed destroy is tolerated at step level.
  allowFailure: true
  entrypoint: 'bash'
  args:
    - '-c'
    - |
      set -e

      cd /workspace/infrastructure

      terraform destroy \
        -var-file=tfvars_tests/standard-gke-public.platform.tfvars \
        -var=project_id=$PROJECT_ID \
        -var=cluster_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-cluster \
        -var=network_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
        -var=subnetwork_name=ml-$SHORT_SHA-$_PR_NUMBER-$_BUILD_ID-$_AUTOPILOT_CLUSTER \
        -var=autopilot_cluster=$_AUTOPILOT_CLUSTER \
        -var=cluster_location=$_REGION \
        -auto-approve -no-color

# Final gate: fail the build unless both marker files written by the
# cluster-creation and hf-tgi test steps contain "pass".
- id: 'check result'
  name: 'gcr.io/$PROJECT_ID/terraform'
  entrypoint: 'bash'
  args:
    - '-c'
    - |
      # A missing marker file yields an empty string, which also counts as a failure.
      if [[ $(cat /workspace/gke_cluster_result.txt) != "pass" ]]; then
        echo "gke cluster creation failed"
        exit 1
      fi

      if [[ $(cat /workspace/hf_result.txt) != "pass" ]]; then
        echo "hf-tgi test failed"
        exit 1
      fi
  waitFor: ['cleanup gke cluster']

# Default substitution values for this build.
substitutions:
  _REGION: us-east4
  _USER_NAME: github
  # Quoted so the value stays a string rather than a YAML boolean.
  _AUTOPILOT_CLUSTER: "false"
  # Bash-style expansion: the first 8 characters of the build ID.
  _BUILD_ID: '${BUILD_ID:0:8}'
options:
  substitutionOption: 'ALLOW_LOOSE'
  # Lets the bash-style expansion in _BUILD_ID be interpreted when the build
  # is submitted manually (per the Cloud Build docs, triggered builds always
  # have this enabled).
  dynamicSubstitutions: true
  machineType: 'E2_HIGHCPU_8'
timeout: 5400s
8 changes: 8 additions & 0 deletions tutorials-and-examples/hf-tgi/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,18 @@ provider "helm" {
}
}

# Create the target namespace through the shared kubernetes-namespace module.
module "namespace" {
  source           = "../../modules/kubernetes-namespace"
  namespace        = var.namespace
  create_namespace = true
}

# Deploy the inference service into var.namespace.
module "inference-server" {
  source = "../../modules/inference-service"

  namespace         = var.namespace
  autopilot_cluster = var.autopilot_cluster
  additional_labels = var.additional_labels

  # Ordered after the namespace module so the namespace exists first.
  depends_on = [module.namespace]
}

resource "helm_release" "gmp-engine" {
Expand All @@ -68,4 +75,5 @@ resource "helm_release" "gmp-engine" {
values = [
"${file("${path.module}/podmonitoring.yaml")}"
]
depends_on = [module.namespace]
}
6 changes: 6 additions & 0 deletions tutorials-and-examples/hf-tgi/workloads.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Replace the placeholder with your own project ID before applying.
project_id   = "<your project ID>"
cluster_name = "l4-demo"
location     = "us-central1"

# Namespace for the workload and whether the cluster is Autopilot.
namespace         = "l4-demo"
autopilot_cluster = false
Loading