diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
index 62b317b68..b1e5a86a6 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
@@ -58,7 +58,7 @@ def generate_request(prompt):
            "temperature": 0.0 if use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": output_len,
-           "ignore_eos": True,
+           "ignore_eos": False,
            "stream": False,
        }
    elif backend == "tgi":
@@ -114,7 +114,10 @@ def get_token_count(prompt, resp):
    number_of_output_tokens = 0
    if backend == "vllm":
-       number_of_output_tokens = 0  # to be added
+       resp_dict = json.loads(resp.content.decode('utf-8'))
+       total_tokens = len(
+           tokenizer.encode(resp_dict["text"][0]))
+       number_of_output_tokens = total_tokens - number_of_input_tokens
    elif backend == "tgi":
        resp_dict = json.loads(resp.content.decode('utf-8'))
        number_of_output_tokens = len(
diff --git a/benchmarks/inference-server/text-generation-inference/secret-templates/secret-provider.tftpl b/benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl
similarity index 100%
rename from benchmarks/inference-server/text-generation-inference/secret-templates/secret-provider.tftpl
rename to benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl
diff --git a/benchmarks/inference-server/text-generation-inference/main.tf b/benchmarks/inference-server/text-generation-inference/main.tf
index b35f05869..c094b1b14 100644
--- a/benchmarks/inference-server/text-generation-inference/main.tf
+++ b/benchmarks/inference-server/text-generation-inference/main.tf
@@ -43,7 +43,7 @@ locals {
  )
  secret_templates_path = (
    var.secret_templates_path == null
-   ? "${path.module}/secret-templates"
+   ? "${path.module}/../templates/secret-templates"
    : pathexpand(var.secret_templates_path)
  )
  hugging_face_token_secret = (
diff --git a/benchmarks/inference-server/vllm/README.md b/benchmarks/inference-server/vllm/README.md
new file mode 100644
index 000000000..a137f73ba
--- /dev/null
+++ b/benchmarks/inference-server/vllm/README.md
@@ -0,0 +1,139 @@
# Benchmark vLLM on GKE

This stage deploys the [vLLM](https://docs.vllm.ai/en/latest/index.html) inference server.

## Instructions

### Step 1: create and configure terraform.tfvars

Create a `terraform.tfvars` file. `sample-terraform.tfvars` is provided as an
example file. You can copy the file as a starting point. Note that you will have
to change the existing `credentials_config`.

```bash
cp sample-terraform.tfvars terraform.tfvars
```

Fill out your `terraform.tfvars` with the desired model and server
configuration, referring to the list of required and optional variables
[here](#inputs). The `credentials_config` variable is required.

#### Determine the number of GPUs

`gpu_count` should be configured according to the size of the model, with some
overhead for the KV cache. Here's an example of how to figure out how many GPUs
you need to run a model:

vLLM defaults to bfloat16 for running supported models on GPUs. For a model with
a dtype of FP16 or bfloat16, each parameter requires 16 bits (2 bytes). A 7
billion parameter model therefore requires a minimum of 7 billion * 2 bytes =
14 GB of GPU memory. A single L4 GPU has 24 GB of GPU memory, so 1 L4 GPU is
sufficient to run the `tiiuae/falcon-7b` model with plenty of overhead for the
KV cache. A minimal sketch of this calculation is shown below.
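To make that arithmetic concrete, here is a rough Python sketch of the same
estimate. It is illustrative only and not part of this stage's tooling: the
2-bytes-per-parameter figure assumes an FP16/bfloat16 dtype, and the
`kv_cache_headroom` factor is an arbitrary assumption to tune for your
workload.

```python
# Back-of-the-envelope GPU-count estimate for serving a model with vLLM.
# Illustrative sketch only: the KV-cache headroom factor is a guess, and
# activation memory and CUDA overhead are ignored.

def min_gpu_count(params: float, bytes_per_param: int = 2,
                  gpu_memory_gb: float = 24.0,
                  kv_cache_headroom: float = 1.2) -> int:
    """Smallest supported GPU count whose combined memory fits the weights."""
    required_gb = params * bytes_per_param / 1e9 * kv_cache_headroom
    for count in (1, 2, 4, 8):  # tensor-parallel sizes this stage accepts
        if count * gpu_memory_gb >= required_gb:
            return count
    raise ValueError("model does not fit on 8 GPUs of this size")

# tiiuae/falcon-7b in bfloat16 on 24 GB L4 GPUs: 7e9 * 2 B = 14 GB -> 1 GPU
print(min_gpu_count(7e9))  # 1
```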
Note that the vLLM server supports a `gpu_count` equal to 1, 2, 4, or 8.

#### [optional] set up credentials config with kubeconfig

If you created your cluster with steps from `../../infra/` or with fleet
management enabled, the existing `credentials_config` must use the fleet host
credentials like this:

```bash
credentials_config = {
  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/$CLUSTER_NAME"
}
```

If you created your own cluster without fleet management enabled, you can use
your cluster's kubeconfig in the `credentials_config`. You must isolate your
cluster's kubeconfig from other clusters in the default kubeconfig file. To do
this, run the following command:

```bash
KUBECONFIG=~/.kube/${CLUSTER_NAME}-kube.config gcloud container clusters get-credentials $CLUSTER_NAME --location $CLUSTER_LOCATION
```

Then update the `credentials_config` in your `terraform.tfvars` to the following:

```bash
credentials_config = {
  kubeconfig = {
    path = "~/.kube/${CLUSTER_NAME}-kube.config"
  }
}
```

#### [optional] set up secret token in Secret Manager

A model may require a security token to access it. For example, Llama2 from
HuggingFace is a gated model that requires a
[user access token](https://huggingface.co/docs/hub/en/security-tokens). If the
model you want to run does not require this, skip this step.

If you followed the steps from `../../infra/`, Secret Manager and the user
access token should already be set up. If not, it is strongly recommended that
you use Workload Identity and Secret Manager to access the user access token,
to avoid adding a plain-text token to the Terraform state. To do so, follow the
instructions for [setting up a secret in Secret Manager here](https://cloud.google.com/kubernetes-engine/docs/tutorials/workload-identity-secrets).

Once complete, add the related secret values to your `terraform.tfvars`:

```bash
# ex. "projects/sample-project/secrets/hugging_face_secret"
hugging_face_secret = $SECRET_ID

# ex. 1
hugging_face_secret_version = $SECRET_VERSION
```

### [Optional] Step 2: configure alternative storage

By default, the vLLM yaml spec assumes that the cluster has [local SSD-backed ephemeral storage](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/local-ssd)
available.

If you wish to use a different storage option with the vLLM server, you can
edit `./manifest-templates/vllm.tftpl` directly with your desired storage
setup.

### Step 3: log in to gcloud

Run the following gcloud command for authorization:

```bash
gcloud auth application-default login
```

### Step 4: terraform initialize, plan and apply

Run the following terraform commands:

```bash
# initialize terraform
terraform init

# verify changes
terraform plan

# apply changes
terraform apply
```

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. | <pre>object({<br>  fleet_host = optional(string)<br>  kubeconfig = optional(object({<br>    context = optional(string)<br>    path    = optional(string, "~/.kube/config")<br>  }))<br>})</pre> | n/a | yes |
| [gpu\_count](#input\_gpu\_count) | Parallelism based on the number of GPUs. | `number` | `1` | no |
| [hugging\_face\_secret](#input\_hugging\_face\_secret) | Secret id in Secret Manager. | `string` | `null` | no |
| [hugging\_face\_secret\_version](#input\_hugging\_face\_secret\_version) | Secret version in Secret Manager. | `string` | `null` | no |
| [ksa](#input\_ksa) | Kubernetes Service Account used for workload. | `string` | `"default"` | no |
| [model\_id](#input\_model\_id) | Model used for inference. | `string` | `"tiiuae/falcon-7b"` | no |
| [namespace](#input\_namespace) | Namespace used for vLLM resources. | `string` | `"default"` | no |
| [project\_id](#input\_project\_id) | Project id of existing or created project. | `string` | n/a | yes |
| [secret\_templates\_path](#input\_secret\_templates\_path) | Path where secret configuration manifest templates will be read from. Set to null to use the default manifests. | `string` | `null` | no |
| [templates\_path](#input\_templates\_path) | Path where manifest templates will be read from. Set to null to use the default manifests. | `string` | `null` | no |
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
new file mode 100644
index 000000000..f34e3eec9
--- /dev/null
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -0,0 +1,60 @@
/**
 * Copyright 2024 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

locals {

  all_templates = concat(local.wl_templates, local.secret_templates)

  wl_templates = [
    for f in fileset(local.wl_templates_path, "*tftpl") :
    "${local.wl_templates_path}/${f}"
  ]

  secret_templates = local.hugging_face_token_secret == null ? [] : [
    for f in fileset(local.secret_templates_path, "*tftpl") :
    "${local.secret_templates_path}/${f}"
  ]

  wl_templates_path = (
    var.templates_path == null
    ? "${path.module}/manifest-templates"
    : pathexpand(var.templates_path)
  )
  secret_templates_path = (
    var.secret_templates_path == null
    ? "${path.module}/../templates/secret-templates"
    : pathexpand(var.secret_templates_path)
  )
  hugging_face_token_secret = (
    var.hugging_face_secret == null || var.hugging_face_secret_version == null
    ? null
    : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
  )
}

resource "kubernetes_manifest" "default" {
  for_each = toset(local.all_templates)
  manifest = yamldecode(templatefile(each.value, {
    namespace                      = var.namespace
    model_id                       = var.model_id
    gpu_count                      = var.gpu_count
    ksa                            = var.ksa
    hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
  }))
  timeouts {
    create = "60m"
  }
}
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
new file mode 100644
index 000000000..d6cab03e5
--- /dev/null
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
@@ -0,0 +1,29 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: v1
kind: Service
metadata:
  name: vllm
  namespace: ${namespace}
  labels:
    app: vllm
spec:
  type: LoadBalancer
  ports:
    - port: 80
      targetPort: 80
      protocol: TCP
  selector:
    app: vllm
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
new file mode 100644
index 000000000..1de1b1f4e
--- /dev/null
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
@@ -0,0 +1,81 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm
  namespace: ${namespace}
  labels:
    app: vllm
spec:
  selector:
    matchLabels:
      app: vllm
  template:
    metadata:
      labels:
        app: vllm
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: ai-on-gke-benchmarks
    spec:
      serviceAccountName: ${ksa}
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
        - name: hftoken
          csi:
            driver: secrets-store.csi.k8s.io
            readOnly: true
            volumeAttributes:
              secretProviderClass: "gcp-secret-provider"
%{ endfor ~}
        - name: data
          hostPath:
            path: /mnt/stateful_partition/kube-ephemeral-ssd/data
      containers:
        - name: vllm
          ports:
            - containerPort: 80
          image: "vllm/vllm-openai:v0.3.3"
          command: ["python3", "-m", "vllm.entrypoints.api_server"]
          args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80"]
          env:
            - name: PORT
              value: "80"
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
            - name: HUGGING_FACE_HUB_TOKEN # token used to access gated models
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: HF_TOKEN
%{ endfor ~}
          resources:
            limits:
              nvidia.com/gpu: ${gpu_count} # number of GPUs allocated to the workload
            requests:
              cpu: "1"
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
            - mountPath: /data
              name: data
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
            - mountPath: "/var/secrets"
              name: hftoken
%{ endfor ~}
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/providers.tf b/benchmarks/inference-server/vllm/providers.tf
new file mode 100644
index 000000000..70c82e817
--- /dev/null
+++ b/benchmarks/inference-server/vllm/providers.tf
@@ -0,0 +1,36 @@
/**
 * Copyright 2024 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

data "google_client_config" "identity" {
  count = var.credentials_config.fleet_host != null ? 1 : 0
}

provider "kubernetes" {
  config_path = (
    var.credentials_config.kubeconfig == null
    ? null
    : pathexpand(var.credentials_config.kubeconfig.path)
  )
  config_context = try(
    var.credentials_config.kubeconfig.context, null
  )
  host = (
    var.credentials_config.fleet_host == null
    ? null
    : var.credentials_config.fleet_host
  )
  token = try(data.google_client_config.identity.0.access_token, null)
}
diff --git a/benchmarks/inference-server/vllm/sample-terraform.tfvars b/benchmarks/inference-server/vllm/sample-terraform.tfvars
new file mode 100644
index 000000000..51d0cf466
--- /dev/null
+++ b/benchmarks/inference-server/vllm/sample-terraform.tfvars
@@ -0,0 +1,9 @@
credentials_config = {
  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
}

namespace  = "benchmark"
ksa        = "benchmark-ksa"
model_id   = "tiiuae/falcon-7b"
gpu_count  = 1
project_id = "$PROJECT_ID"
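Once `terraform apply` finishes, the `vllm` Service defined above exposes the
server on port 80 through a LoadBalancer IP. As a quick smoke test (not part of
this change), a minimal Python sketch along these lines could exercise the same
`/generate` endpoint and `"text"` response field that the updated Locust task
parses; `EXTERNAL_IP` is a placeholder for the address reported by
`kubectl get service vllm -n <namespace>`, and the sampling parameters are
arbitrary examples:

```python
import json

import requests  # third-party; pip install requests

# Placeholder: substitute the EXTERNAL-IP of the `vllm` Service.
VLLM_URL = "http://EXTERNAL_IP/generate"

payload = {
    "prompt": "What is a GPU?",
    "max_tokens": 128,
    "temperature": 1.0,
    "stream": False,
}

resp = requests.post(VLLM_URL, json=payload, timeout=120)
resp.raise_for_status()

# The api_server returns the prompt followed by the completion in "text",
# which is why tasks.py subtracts the input token count from the total.
print(json.loads(resp.content.decode("utf-8"))["text"][0])
```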