From 1ec6e8ebe162a5b8fc67fb60d93facafba8c2950 Mon Sep 17 00:00:00 2001
From: Ashok Chandrasekar
Date: Mon, 8 Apr 2024 13:03:12 -0700
Subject: [PATCH] Add vLLM model server support for inference benchmarks
 (#450)

This change adds vLLM as a supported model server for benchmarking. The
change includes:
1. Terraform automation to deploy vLLM with different models and
   configurations.
2. vLLM as a backend in the benchmarking tool.
3. vLLM request/response parsing to accurately measure output token
   counts and latency.
---
 .../locust-docker/locust-tasks/tasks.py       |   7 +-
 .../secret-templates/secret-provider.tftpl    |   0
 .../text-generation-inference/main.tf         |   2 +-
 benchmarks/inference-server/vllm/README.md    | 139 ++++++++++++++++++
 benchmarks/inference-server/vllm/main.tf      |  60 ++++++++
 .../manifest-templates/vllm-service.tftpl     |  29 ++++
 .../vllm/manifest-templates/vllm.tftpl        |  81 ++++++++++
 benchmarks/inference-server/vllm/providers.tf |  36 +++++
 .../vllm/sample-terraform.tfvars              |   9 ++
 benchmarks/inference-server/vllm/variables.tf |  97 ++++++++++++
 10 files changed, 457 insertions(+), 3 deletions(-)
 rename benchmarks/inference-server/{text-generation-inference => templates}/secret-templates/secret-provider.tftpl (100%)
 create mode 100644 benchmarks/inference-server/vllm/README.md
 create mode 100644 benchmarks/inference-server/vllm/main.tf
 create mode 100644 benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
 create mode 100644 benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
 create mode 100644 benchmarks/inference-server/vllm/providers.tf
 create mode 100644 benchmarks/inference-server/vllm/sample-terraform.tfvars
 create mode 100644 benchmarks/inference-server/vllm/variables.tf

diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
index 62b317b68..b1e5a86a6 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
@@ -58,7 +58,7 @@ def generate_request(prompt):
             "temperature": 0.0 if use_beam_search else 1.0,
             "top_p": 1.0,
             "max_tokens": output_len,
-            "ignore_eos": True,
+            "ignore_eos": False,
             "stream": False,
         }
     elif backend == "tgi":
@@ -114,7 +114,10 @@ def get_token_count(prompt, resp):
     number_of_output_tokens = 0
 
     if backend == "vllm":
-        number_of_output_tokens = 0  # to be added
+        # vLLM's api_server returns the prompt concatenated with the
+        # completion in "text", so subtract the prompt's token count.
+        resp_dict = json.loads(resp.content.decode('utf-8'))
+        total_tokens = len(
+            tokenizer.encode(resp_dict["text"][0]))
+        number_of_output_tokens = total_tokens - number_of_input_tokens
     elif backend == "tgi":
         resp_dict = json.loads(resp.content.decode('utf-8'))
         number_of_output_tokens = len(
diff --git a/benchmarks/inference-server/text-generation-inference/secret-templates/secret-provider.tftpl b/benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl
similarity index 100%
rename from benchmarks/inference-server/text-generation-inference/secret-templates/secret-provider.tftpl
rename to benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl
diff --git a/benchmarks/inference-server/text-generation-inference/main.tf b/benchmarks/inference-server/text-generation-inference/main.tf
index b35f05869..c094b1b14 100644
--- a/benchmarks/inference-server/text-generation-inference/main.tf
+++ b/benchmarks/inference-server/text-generation-inference/main.tf
@@ -43,7 +43,7 @@ locals {
   )
   secret_templates_path = (
     var.secret_templates_path == null
-    ? "${path.module}/secret-templates"
+    ? "${path.module}/../templates/secret-templates"
     : pathexpand(var.secret_templates_path)
   )
   hugging_face_token_secret = (
diff --git a/benchmarks/inference-server/vllm/README.md b/benchmarks/inference-server/vllm/README.md
new file mode 100644
index 000000000..a137f73ba
--- /dev/null
+++ b/benchmarks/inference-server/vllm/README.md
@@ -0,0 +1,139 @@
+# Benchmark vLLM on GKE
+
+This stage deploys the [vLLM](https://docs.vllm.ai/en/latest/index.html) inference server.
+
+## Instructions
+
+### Step 1: create and configure terraform.tfvars
+
+Create a `terraform.tfvars` file. `sample-terraform.tfvars` is provided as an
+example file. You can copy it as a starting point. Note that you will have
+to change the existing `credentials_config`.
+
+```bash
+cp sample-terraform.tfvars terraform.tfvars
+```
+
+Fill out your `terraform.tfvars` with the desired model and server
+configuration, referring to the list of required and optional variables
+[here](#inputs). The variables `credentials_config` and `project_id` are
+required.
+
+#### Determine the number of GPUs
+
+`gpu_count` should be sized according to the model, with some overhead for
+the KV cache. Here's an example of how to figure out how many GPUs you need
+to run a model:
+
+vLLM defaults to bfloat16 for running supported models on GPUs. For a model
+with a dtype of FP16 or bfloat16, each parameter requires 16 bits (2 bytes).
+A 7 billion parameter model therefore requires a minimum of
+7 billion * 2 bytes = 14 GB of GPU memory. A single L4 GPU has 24 GB of GPU
+memory, so 1 L4 GPU is sufficient to run the `tiiuae/falcon-7b` model with
+plenty of overhead for the KV cache.
+
+Note that the vLLM server supports a `gpu_count` of 1, 2, 4, or 8.
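+
+As a rough sketch of the same arithmetic for a hypothetical larger model
+(illustrative numbers only; real memory use also depends on context length
+and batch size):
+
+```bash
+# Hypothetical sizing for a 13B-parameter model on L4 GPUs.
+PARAMS_B=13        # parameters, in billions
+BYTES_PER_PARAM=2  # bfloat16/FP16 = 2 bytes per parameter
+GPU_MEM_GB=24      # memory of one L4 GPU
+WEIGHTS_GB=$((PARAMS_B * BYTES_PER_PARAM))  # 26 GB of weights alone
+echo "Weights need ~${WEIGHTS_GB} GB; one ${GPU_MEM_GB} GB L4 is not enough."
+# Two L4s provide 48 GB, leaving ~22 GB of headroom for the KV cache,
+# so gpu_count = 2 is a reasonable starting point for this model.
+```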
+
+#### [optional] set up credentials config with kubeconfig
+
+If you created your cluster with steps from `../../infra/` or with fleet
+management enabled, the existing `credentials_config` must use the fleet host
+credentials like this:
+
+```bash
+credentials_config = {
+  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/$CLUSTER_NAME"
+}
+```
+
+If you created your own cluster without fleet management enabled, you can use
+your cluster's kubeconfig in the `credentials_config`. You must isolate your
+cluster's kubeconfig from other clusters in the default kube.config file. To
+do this, run the following command:
+
+```bash
+KUBECONFIG=~/.kube/${CLUSTER_NAME}-kube.config gcloud container clusters get-credentials $CLUSTER_NAME --location $CLUSTER_LOCATION
+```
+
+Then update your `terraform.tfvars` `credentials_config` to the following:
+
+```bash
+credentials_config = {
+  kubeconfig = {
+    path = "~/.kube/${CLUSTER_NAME}-kube.config"
+  }
+}
+```
+
+#### [optional] set up secret token in Secret Manager
+
+A model may require a security token to access it. For example, Llama2 from
+HuggingFace is a gated model that requires a
+[user access token](https://huggingface.co/docs/hub/en/security-tokens). If
+the model you want to run does not require this, skip this step.
+
+If you followed the steps from `../../infra/`, Secret Manager and the user
+access token should already be set up. If not, it is strongly recommended
+that you use Workload Identity and Secret Manager to access the user access
+tokens, to avoid adding a plain-text token to the Terraform state. To do so,
+follow the instructions for
+[setting up a secret in Secret Manager here](https://cloud.google.com/kubernetes-engine/docs/tutorials/workload-identity-secrets).
+
+Once complete, you should add these related secret values to your
+`terraform.tfvars`:
+
+```bash
+# ex. "projects/sample-project/secrets/hugging_face_secret"
+hugging_face_secret = $SECRET_ID
+
+# ex. 1
+hugging_face_secret_version = $SECRET_VERSION
+```
+
+### [Optional] Step 2: configure alternative storage
+
+By default, the vLLM YAML spec assumes that the cluster has [local SSD-backed ephemeral storage](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/local-ssd)
+available.
+
+If you wish to use a different storage option with the vLLM server, you can
+edit `./manifest-templates/vllm.tftpl` directly with your desired storage
+setup.
+
+### Step 3: login to gcloud
+
+Run the following gcloud command for authorization:
+
+```bash
+gcloud auth application-default login
+```
+
+### Step 4: terraform initialize, plan and apply
+
+Run the following terraform commands:
+
+```bash
+# initialize terraform
+terraform init
+
+# verify changes
+terraform plan
+
+# apply changes
+terraform apply
+```
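+
+Once the apply completes, you can optionally smoke-test the server. The
+sketch below assumes the `benchmark` namespace from `sample-terraform.tfvars`
+and queries the `/generate` endpoint of vLLM's `api_server`; adjust the names
+to match your configuration:
+
+```bash
+# Look up the external IP assigned to the vllm LoadBalancer service.
+VLLM_IP=$(kubectl get service vllm -n benchmark \
+  -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+# Send one generation request; the "text" field of the response contains
+# the prompt followed by the completion.
+curl -X POST "http://${VLLM_IP}/generate" \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "What is a GPU?", "max_tokens": 64, "temperature": 0.9}'
+```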
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. | <pre>object({<br>    fleet_host = optional(string)<br>    kubeconfig = optional(object({<br>      context = optional(string)<br>      path    = optional(string, "~/.kube/config")<br>    }))<br>  })</pre> | n/a | yes |
+| [gpu\_count](#input\_gpu\_count) | Number of GPUs used by the model server (tensor parallel size). | `number` | `1` | no |
+| [hugging\_face\_secret](#input\_hugging\_face\_secret) | Secret ID in Secret Manager. | `string` | `null` | no |
+| [hugging\_face\_secret\_version](#input\_hugging\_face\_secret\_version) | Secret version in Secret Manager. | `string` | `null` | no |
+| [ksa](#input\_ksa) | Kubernetes Service Account used for the workload. | `string` | `"default"` | no |
+| [model\_id](#input\_model\_id) | Model used for inference. | `string` | `"tiiuae/falcon-7b"` | no |
+| [namespace](#input\_namespace) | Namespace used for vLLM resources. | `string` | `"default"` | no |
+| [project\_id](#input\_project\_id) | Project ID of the existing or created project. | `string` | n/a | yes |
+| [secret\_templates\_path](#input\_secret\_templates\_path) | Path where secret configuration manifest templates will be read from. Set to null to use the default manifests. | `string` | `null` | no |
+| [templates\_path](#input\_templates\_path) | Path where manifest templates will be read from. Set to null to use the default manifests. | `string` | `null` | no |
+
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
new file mode 100644
index 000000000..f34e3eec9
--- /dev/null
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+locals {
+
+  all_templates = concat(local.wl_templates, local.secret_templates)
+
+  wl_templates = [
+    for f in fileset(local.wl_templates_path, "*tftpl") :
+    "${local.wl_templates_path}/${f}"
+  ]
+
+  secret_templates = local.hugging_face_token_secret == null ? [] : [
+    for f in fileset(local.secret_templates_path, "*tftpl") :
+    "${local.secret_templates_path}/${f}"
+  ]
+
+  wl_templates_path = (
+    var.templates_path == null
+    ? "${path.module}/manifest-templates"
+    : pathexpand(var.templates_path)
+  )
+  secret_templates_path = (
+    var.secret_templates_path == null
+    ? "${path.module}/../templates/secret-templates"
+    : pathexpand(var.secret_templates_path)
+  )
+  hugging_face_token_secret = (
+    var.hugging_face_secret == null || var.hugging_face_secret_version == null
+    ? null
+    : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
+  )
+}
+
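+# Render each manifest template with the variables below and apply the
+# result to the cluster; one kubernetes_manifest resource per template.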
+resource "kubernetes_manifest" "default" {
+  for_each = toset(local.all_templates)
+  manifest = yamldecode(templatefile(each.value, {
+    namespace                      = var.namespace
+    model_id                       = var.model_id
+    gpu_count                      = var.gpu_count
+    ksa                            = var.ksa
+    hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
+  }))
+  timeouts {
+    create = "60m"
+  }
+}
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
new file mode 100644
index 000000000..d6cab03e5
--- /dev/null
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
@@ -0,0 +1,29 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm
+  namespace: ${namespace}
+  labels:
+    app: vllm
+spec:
+  type: LoadBalancer
+  ports:
+    - port: 80
+      targetPort: 80
+      protocol: TCP
+  selector:
+    app: vllm
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
new file mode 100644
index 000000000..1de1b1f4e
--- /dev/null
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
@@ -0,0 +1,81 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
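+#
+# This template is rendered by main.tf via templatefile(). The %{ for ~}
+# blocks below expand only when hugging_face_token_secret_list is non-empty,
+# i.e. when a Hugging Face token secret was configured; otherwise the token
+# volume, env var, and volume mount are omitted from the manifest.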
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm
+  namespace: ${namespace}
+  labels:
+    app: vllm
+spec:
+  selector:
+    matchLabels:
+      app: vllm
+  template:
+    metadata:
+      labels:
+        app: vllm
+        ai.gke.io/inference-server: vllm
+        examples.ai.gke.io/source: ai-on-gke-benchmarks
+    spec:
+      serviceAccountName: ${ksa}
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
+        - name: hftoken
+          csi:
+            driver: secrets-store.csi.k8s.io
+            readOnly: true
+            volumeAttributes:
+              secretProviderClass: "gcp-secret-provider"
+%{ endfor ~}
+        - name: data
+          hostPath:
+            path: /mnt/stateful_partition/kube-ephemeral-ssd/data
+      containers:
+        - name: vllm
+          ports:
+            - containerPort: 80
+          image: "vllm/vllm-openai:v0.3.3"
+          command: ["python3", "-m", "vllm.entrypoints.api_server"]
+          args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80"]
+          env:
+            - name: PORT
+              value: "80"
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
+            - name: HUGGING_FACE_HUB_TOKEN # token used to access gated models
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token
+                  key: HF_TOKEN
+%{ endfor ~}
+          resources:
+            limits:
+              nvidia.com/gpu: ${gpu_count} # number of GPUs allocated to the workload
+            requests:
+              cpu: "1"
+          volumeMounts:
+            - mountPath: /dev/shm
+              name: dshm
+            - mountPath: /data
+              name: data
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
+            - mountPath: "/var/secrets"
+              name: hftoken
+%{ endfor ~}
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/providers.tf b/benchmarks/inference-server/vllm/providers.tf
new file mode 100644
index 000000000..70c82e817
--- /dev/null
+++ b/benchmarks/inference-server/vllm/providers.tf
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+data "google_client_config" "identity" {
+  count = var.credentials_config.fleet_host != null ? 1 : 0
+}
+
+provider "kubernetes" {
+  config_path = (
+    var.credentials_config.kubeconfig == null
+    ? null
+    : pathexpand(var.credentials_config.kubeconfig.path)
+  )
+  config_context = try(
+    var.credentials_config.kubeconfig.context, null
+  )
+  host = (
+    var.credentials_config.fleet_host == null
+    ? null
+    : var.credentials_config.fleet_host
+  )
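+  # With a fleet host, authenticate using the caller's access token from the
+  # google_client_config data source above.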
+  token = try(data.google_client_config.identity.0.access_token, null)
+}
diff --git a/benchmarks/inference-server/vllm/sample-terraform.tfvars b/benchmarks/inference-server/vllm/sample-terraform.tfvars
new file mode 100644
index 000000000..51d0cf466
--- /dev/null
+++ b/benchmarks/inference-server/vllm/sample-terraform.tfvars
@@ -0,0 +1,9 @@
+credentials_config = {
+  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
+}
+
+namespace  = "benchmark"
+ksa        = "benchmark-ksa"
+model_id   = "tiiuae/falcon-7b"
+gpu_count  = 1
+project_id = ""
diff --git a/benchmarks/inference-server/vllm/variables.tf b/benchmarks/inference-server/vllm/variables.tf
new file mode 100644
index 000000000..79455ca03
--- /dev/null
+++ b/benchmarks/inference-server/vllm/variables.tf
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+variable "credentials_config" {
+  description = "Configure how Terraform authenticates to the cluster."
+  type = object({
+    fleet_host = optional(string)
+    kubeconfig = optional(object({
+      context = optional(string)
+      path    = optional(string, "~/.kube/config")
+    }))
+  })
+  nullable = false
+  validation {
+    condition = (
+      (var.credentials_config.fleet_host != null) !=
+      (var.credentials_config.kubeconfig != null)
+    )
+    error_message = "Exactly one of fleet host or kubeconfig must be set."
+  }
+}
+
+variable "namespace" {
+  description = "Namespace used for vLLM resources."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "model_id" {
+  description = "Model used for inference."
+  type        = string
+  nullable    = false
+  default     = "tiiuae/falcon-7b"
+}
+
+variable "gpu_count" {
+  description = "Number of GPUs used by the model server (tensor parallel size)."
+  type        = number
+  nullable    = false
+  default     = 1
+  validation {
+    condition     = var.gpu_count == 1 || var.gpu_count == 2 || var.gpu_count == 4 || var.gpu_count == 8
+    error_message = "The vLLM server supports a gpu_count of 1, 2, 4, or 8."
+  }
+}
+
+variable "ksa" {
+  description = "Kubernetes Service Account used for the workload."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "templates_path" {
+  description = "Path where manifest templates will be read from. Set to null to use the default manifests."
+  type        = string
+  default     = null
+}
+
+variable "secret_templates_path" {
+  description = "Path where secret configuration manifest templates will be read from. Set to null to use the default manifests."
+  type        = string
+  default     = null
+}
+
+variable "hugging_face_secret" {
+  description = "Secret ID in Secret Manager."
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "hugging_face_secret_version" {
+  description = "Secret version in Secret Manager."
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "project_id" {
+  description = "Project ID of the existing or created project."
+  type        = string
+}