From 1ec6e8ebe162a5b8fc67fb60d93facafba8c2950 Mon Sep 17 00:00:00 2001
From: Ashok Chandrasekar
Date: Mon, 8 Apr 2024 13:03:12 -0700
Subject: [PATCH] Add vLLM model server support for inference benchmarks
 (#450)

This change adds vLLM as a supported model server for benchmarking. The
change includes:
1. Terraform automation to deploy vLLM with different models and
   configurations.
2. vLLM as a backend in the benchmarking tool.
3. vLLM request/response parsing to accurately measure output token
   counts and latency.
---
 .../locust-docker/locust-tasks/tasks.py       |   7 +-
 .../secret-templates/secret-provider.tftpl    |   0
 .../text-generation-inference/main.tf         |   2 +-
 benchmarks/inference-server/vllm/README.md    | 139 ++++++++++++++++++
 benchmarks/inference-server/vllm/main.tf      |  60 ++++++++
 .../manifest-templates/vllm-service.tftpl     |  29 ++++
 .../vllm/manifest-templates/vllm.tftpl        |  81 ++++++++++
 benchmarks/inference-server/vllm/providers.tf |  36 +++++
 .../vllm/sample-terraform.tfvars              |   9 ++
 benchmarks/inference-server/vllm/variables.tf |  97 ++++++++++++
 10 files changed, 457 insertions(+), 3 deletions(-)
 rename benchmarks/inference-server/{text-generation-inference => templates}/secret-templates/secret-provider.tftpl (100%)
 create mode 100644 benchmarks/inference-server/vllm/README.md
 create mode 100644 benchmarks/inference-server/vllm/main.tf
 create mode 100644 benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
 create mode 100644 benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
 create mode 100644 benchmarks/inference-server/vllm/providers.tf
 create mode 100644 benchmarks/inference-server/vllm/sample-terraform.tfvars
 create mode 100644 benchmarks/inference-server/vllm/variables.tf

diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
index 62b317b68..b1e5a86a6 100644
--- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
+++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py
@@ -58,7 +58,7 @@ def generate_request(prompt):
             "temperature": 0.0 if use_beam_search else 1.0,
             "top_p": 1.0,
             "max_tokens": output_len,
-            "ignore_eos": True,
+            "ignore_eos": False,
             "stream": False,
         }
     elif backend == "tgi":
@@ -114,7 +114,10 @@ def get_token_count(prompt, resp):
     number_of_output_tokens = 0
 
     if backend == "vllm":
-        number_of_output_tokens = 0  # to be added
+        # vLLM's api_server returns the prompt concatenated with the
+        # completion in "text", so subtract the prompt's token count.
+        resp_dict = json.loads(resp.content.decode('utf-8'))
+        total_tokens = len(
+            tokenizer.encode(resp_dict["text"][0]))
+        number_of_output_tokens = total_tokens - number_of_input_tokens
     elif backend == "tgi":
         resp_dict = json.loads(resp.content.decode('utf-8'))
         number_of_output_tokens = len(
diff --git a/benchmarks/inference-server/text-generation-inference/secret-templates/secret-provider.tftpl b/benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl
similarity index 100%
rename from benchmarks/inference-server/text-generation-inference/secret-templates/secret-provider.tftpl
rename to benchmarks/inference-server/templates/secret-templates/secret-provider.tftpl
diff --git a/benchmarks/inference-server/text-generation-inference/main.tf b/benchmarks/inference-server/text-generation-inference/main.tf
index b35f05869..c094b1b14 100644
--- a/benchmarks/inference-server/text-generation-inference/main.tf
+++ b/benchmarks/inference-server/text-generation-inference/main.tf
@@ -43,7 +43,7 @@ locals {
   )
   secret_templates_path = (
     var.secret_templates_path == null
-    ? "${path.module}/secret-templates"
+    ? "${path.module}/../templates/secret-templates"
     : pathexpand(var.secret_templates_path)
   )
   hugging_face_token_secret = (
diff --git a/benchmarks/inference-server/vllm/README.md b/benchmarks/inference-server/vllm/README.md
new file mode 100644
index 000000000..a137f73ba
--- /dev/null
+++ b/benchmarks/inference-server/vllm/README.md
@@ -0,0 +1,139 @@
+# Benchmark vLLM on GKE
+
+This stage deploys the [vLLM](https://docs.vllm.ai/en/latest/index.html) inference server.
+
+## Instructions
+
+### Step 1: create and configure terraform.tfvars
+
+Create a `terraform.tfvars` file. `sample-terraform.tfvars` is provided as an
+example file. You can copy it as a starting point. Note that you will have
+to change the existing `credentials_config`.
+
+```bash
+cp sample-terraform.tfvars terraform.tfvars
+```
+
+Fill out your `terraform.tfvars` with the desired model and server
+configuration, referring to the list of required and optional variables
+[here](#inputs). The variables `credentials_config` and `project_id` are
+required.
+
+#### Determine the number of GPUs
+
+`gpu_count` should be sized according to the model, with some overhead for
+the KV cache. Here's an example of how to figure out how many GPUs you need
+to run a model:
+
+vLLM defaults to bfloat16 for running supported models on GPUs. For a model
+with a dtype of FP16 or bfloat16, each parameter requires 16 bits (2 bytes).
+A 7 billion parameter model therefore requires a minimum of
+7 billion * 2 bytes = 14 GB of GPU memory. A single L4 GPU has 24 GB of GPU
+memory, so 1 L4 GPU is sufficient to run the `tiiuae/falcon-7b` model with
+plenty of overhead for the KV cache.
+
+Note that the vLLM server supports a `gpu_count` of 1, 2, 4, or 8.
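+
+As a rough sketch of the same arithmetic for a hypothetical larger model
+(illustrative numbers only; real memory use also depends on context length
+and batch size):
+
+```bash
+# Hypothetical sizing for a 13B-parameter model on L4 GPUs.
+PARAMS_B=13        # parameters, in billions
+BYTES_PER_PARAM=2  # bfloat16/FP16 = 2 bytes per parameter
+GPU_MEM_GB=24      # memory of one L4 GPU
+WEIGHTS_GB=$((PARAMS_B * BYTES_PER_PARAM))  # 26 GB of weights alone
+echo "Weights need ~${WEIGHTS_GB} GB; one ${GPU_MEM_GB} GB L4 is not enough."
+# Two L4s provide 48 GB, leaving ~22 GB of headroom for the KV cache,
+# so gpu_count = 2 is a reasonable starting point for this model.
+```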
+
+#### [optional] set up credentials config with kubeconfig
+
+If you created your cluster with steps from `../../infra/` or with fleet
+management enabled, the existing `credentials_config` must use the fleet host
+credentials like this:
+
+```bash
+credentials_config = {
+  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/$CLUSTER_NAME"
+}
+```
+
+If you created your own cluster without fleet management enabled, you can use
+your cluster's kubeconfig in the `credentials_config`. You must isolate your
+cluster's kubeconfig from other clusters in the default kube.config file. To
+do this, run the following command:
+
+```bash
+KUBECONFIG=~/.kube/${CLUSTER_NAME}-kube.config gcloud container clusters get-credentials $CLUSTER_NAME --location $CLUSTER_LOCATION
+```
+
+Then update your `terraform.tfvars` `credentials_config` to the following:
+
+```bash
+credentials_config = {
+  kubeconfig = {
+    path = "~/.kube/${CLUSTER_NAME}-kube.config"
+  }
+}
+```
+
+#### [optional] set up secret token in Secret Manager
+
+A model may require a security token to access it. For example, Llama2 from
+HuggingFace is a gated model that requires a
+[user access token](https://huggingface.co/docs/hub/en/security-tokens). If
+the model you want to run does not require this, skip this step.
+
+If you followed the steps from `../../infra/`, Secret Manager and the user
+access token should already be set up. If not, it is strongly recommended
+that you use Workload Identity and Secret Manager to access the user access
+tokens, to avoid adding a plain-text token to the Terraform state. To do so,
+follow the instructions for
+[setting up a secret in Secret Manager here](https://cloud.google.com/kubernetes-engine/docs/tutorials/workload-identity-secrets).
+
+Once complete, you should add these related secret values to your
+`terraform.tfvars`:
+
+```bash
+# ex. "projects/sample-project/secrets/hugging_face_secret"
+hugging_face_secret = $SECRET_ID
+
+# ex. 1
+hugging_face_secret_version = $SECRET_VERSION
+```
+
+### [Optional] Step 2: configure alternative storage
+
+By default, the vLLM YAML spec assumes that the cluster has [local SSD-backed ephemeral storage](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/local-ssd)
+available.
+
+If you wish to use a different storage option with the vLLM server, you can
+edit `./manifest-templates/vllm.tftpl` directly with your desired storage
+setup.
+
+### Step 3: login to gcloud
+
+Run the following gcloud command for authorization:
+
+```bash
+gcloud auth application-default login
+```
+
+### Step 4: terraform initialize, plan and apply
+
+Run the following terraform commands:
+
+```bash
+# initialize terraform
+terraform init
+
+# verify changes
+terraform plan
+
+# apply changes
+terraform apply
+```
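+
+Once the apply completes, you can optionally smoke-test the server. The
+sketch below assumes the `benchmark` namespace from `sample-terraform.tfvars`
+and queries the `/generate` endpoint of vLLM's `api_server`; adjust the names
+to match your configuration:
+
+```bash
+# Look up the external IP assigned to the vllm LoadBalancer service.
+VLLM_IP=$(kubectl get service vllm -n benchmark \
+  -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+# Send one generation request; the "text" field of the response contains
+# the prompt followed by the completion.
+curl -X POST "http://${VLLM_IP}/generate" \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "What is a GPU?", "max_tokens": 64, "temperature": 0.9}'
+```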
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. | <pre>object({<br>    fleet_host = optional(string)<br>    kubeconfig = optional(object({<br>      context = optional(string)<br>      path    = optional(string, "~/.kube/config")<br>    }))<br>  })</pre> | n/a | yes |
+| [gpu\_count](#input\_gpu\_count) | Number of GPUs used by the model server (tensor parallel size). | `number` | `1` | no |
+| [hugging\_face\_secret](#input\_hugging\_face\_secret) | Secret ID in Secret Manager. | `string` | `null` | no |
+| [hugging\_face\_secret\_version](#input\_hugging\_face\_secret\_version) | Secret version in Secret Manager. | `string` | `null` | no |
+| [ksa](#input\_ksa) | Kubernetes Service Account used for the workload. | `string` | `"default"` | no |
+| [model\_id](#input\_model\_id) | Model used for inference. | `string` | `"tiiuae/falcon-7b"` | no |
+| [namespace](#input\_namespace) | Namespace used for vLLM resources. | `string` | `"default"` | no |
+| [project\_id](#input\_project\_id) | Project ID of the existing or created project. | `string` | n/a | yes |
+| [secret\_templates\_path](#input\_secret\_templates\_path) | Path where secret configuration manifest templates will be read from. Set to null to use the default manifests. | `string` | `null` | no |
+| [templates\_path](#input\_templates\_path) | Path where manifest templates will be read from. Set to null to use the default manifests. | `string` | `null` | no |
+
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
new file mode 100644
index 000000000..f34e3eec9
--- /dev/null
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+locals {
+
+  all_templates = concat(local.wl_templates, local.secret_templates)
+
+  wl_templates = [
+    for f in fileset(local.wl_templates_path, "*tftpl") :
+    "${local.wl_templates_path}/${f}"
+  ]
+
+  secret_templates = local.hugging_face_token_secret == null ? [] : [
+    for f in fileset(local.secret_templates_path, "*tftpl") :
+    "${local.secret_templates_path}/${f}"
+  ]
+
+  wl_templates_path = (
+    var.templates_path == null
+    ? "${path.module}/manifest-templates"
+    : pathexpand(var.templates_path)
+  )
+  secret_templates_path = (
+    var.secret_templates_path == null
+    ? "${path.module}/../templates/secret-templates"
+    : pathexpand(var.secret_templates_path)
+  )
+  hugging_face_token_secret = (
+    var.hugging_face_secret == null || var.hugging_face_secret_version == null
+    ? null
+    : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
+  )
+}
+
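+# Render each manifest template with the variables below and apply the
+# result to the cluster; one kubernetes_manifest resource per template.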
+resource "kubernetes_manifest" "default" {
+  for_each = toset(local.all_templates)
+  manifest = yamldecode(templatefile(each.value, {
+    namespace                      = var.namespace
+    model_id                       = var.model_id
+    gpu_count                      = var.gpu_count
+    ksa                            = var.ksa
+    hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
+  }))
+  timeouts {
+    create = "60m"
+  }
+}
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
new file mode 100644
index 000000000..d6cab03e5
--- /dev/null
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm-service.tftpl
@@ -0,0 +1,29 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm
+  namespace: ${namespace}
+  labels:
+    app: vllm
+spec:
+  type: LoadBalancer
+  ports:
+    - port: 80
+      targetPort: 80
+      protocol: TCP
+  selector:
+    app: vllm
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
new file mode 100644
index 000000000..1de1b1f4e
--- /dev/null
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
@@ -0,0 +1,81 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
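+#
+# This template is rendered by main.tf via templatefile(). The %{ for ~}
+# blocks below expand only when hugging_face_token_secret_list is non-empty,
+# i.e. when a Hugging Face token secret was configured; otherwise the token
+# volume, env var, and volume mount are omitted from the manifest.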
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm
+  namespace: ${namespace}
+  labels:
+    app: vllm
+spec:
+  selector:
+    matchLabels:
+      app: vllm
+  template:
+    metadata:
+      labels:
+        app: vllm
+        ai.gke.io/inference-server: vllm
+        examples.ai.gke.io/source: ai-on-gke-benchmarks
+    spec:
+      serviceAccountName: ${ksa}
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
+        - name: hftoken
+          csi:
+            driver: secrets-store.csi.k8s.io
+            readOnly: true
+            volumeAttributes:
+              secretProviderClass: "gcp-secret-provider"
+%{ endfor ~}
+        - name: data
+          hostPath:
+            path: /mnt/stateful_partition/kube-ephemeral-ssd/data
+      containers:
+        - name: vllm
+          ports:
+            - containerPort: 80
+          image: "vllm/vllm-openai:v0.3.3"
+          command: ["python3", "-m", "vllm.entrypoints.api_server"]
+          args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80"]
+          env:
+            - name: PORT
+              value: "80"
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
+            - name: HUGGING_FACE_HUB_TOKEN # token used to access gated models
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token
+                  key: HF_TOKEN
+%{ endfor ~}
+          resources:
+            limits:
+              nvidia.com/gpu: ${gpu_count} # number of GPUs allocated to the workload
+            requests:
+              cpu: "1"
+          volumeMounts:
+            - mountPath: /dev/shm
+              name: dshm
+            - mountPath: /data
+              name: data
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
+            - mountPath: "/var/secrets"
+              name: hftoken
+%{ endfor ~}
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/providers.tf b/benchmarks/inference-server/vllm/providers.tf
new file mode 100644
index 000000000..70c82e817
--- /dev/null
+++ b/benchmarks/inference-server/vllm/providers.tf
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+data "google_client_config" "identity" {
+  count = var.credentials_config.fleet_host != null ? 1 : 0
+}
+
+provider "kubernetes" {
+  config_path = (
+    var.credentials_config.kubeconfig == null
+    ? null
+    : pathexpand(var.credentials_config.kubeconfig.path)
+  )
+  config_context = try(
+    var.credentials_config.kubeconfig.context, null
+  )
+  host = (
+    var.credentials_config.fleet_host == null
+    ? null
+    : var.credentials_config.fleet_host
+  )
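+  # With a fleet host, authenticate using the caller's access token from the
+  # google_client_config data source above.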
+  token = try(data.google_client_config.identity.0.access_token, null)
+}
diff --git a/benchmarks/inference-server/vllm/sample-terraform.tfvars b/benchmarks/inference-server/vllm/sample-terraform.tfvars
new file mode 100644
index 000000000..51d0cf466
--- /dev/null
+++ b/benchmarks/inference-server/vllm/sample-terraform.tfvars
@@ -0,0 +1,9 @@
+credentials_config = {
+  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark"
+}
+
+namespace  = "benchmark"
+ksa        = "benchmark-ksa"
+model_id   = "tiiuae/falcon-7b"
+gpu_count  = 1
+project_id = ""
diff --git a/benchmarks/inference-server/vllm/variables.tf b/benchmarks/inference-server/vllm/variables.tf
new file mode 100644
index 000000000..79455ca03
--- /dev/null
+++ b/benchmarks/inference-server/vllm/variables.tf
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+variable "credentials_config" {
+  description = "Configure how Terraform authenticates to the cluster."
+  type = object({
+    fleet_host = optional(string)
+    kubeconfig = optional(object({
+      context = optional(string)
+      path    = optional(string, "~/.kube/config")
+    }))
+  })
+  nullable = false
+  validation {
+    condition = (
+      (var.credentials_config.fleet_host != null) !=
+      (var.credentials_config.kubeconfig != null)
+    )
+    error_message = "Exactly one of fleet host or kubeconfig must be set."
+  }
+}
+
+variable "namespace" {
+  description = "Namespace used for vLLM resources."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "model_id" {
+  description = "Model used for inference."
+  type        = string
+  nullable    = false
+  default     = "tiiuae/falcon-7b"
+}
+
+variable "gpu_count" {
+  description = "Number of GPUs used by the model server (tensor parallel size)."
+  type        = number
+  nullable    = false
+  default     = 1
+  validation {
+    condition     = var.gpu_count == 1 || var.gpu_count == 2 || var.gpu_count == 4 || var.gpu_count == 8
+    error_message = "The vLLM server supports a gpu_count of 1, 2, 4, or 8."
+  }
+}
+
+variable "ksa" {
+  description = "Kubernetes Service Account used for the workload."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "templates_path" {
+  description = "Path where manifest templates will be read from. Set to null to use the default manifests."
+  type        = string
+  default     = null
+}
+
+variable "secret_templates_path" {
+  description = "Path where secret configuration manifest templates will be read from. Set to null to use the default manifests."
+  type        = string
+  default     = null
+}
+
+variable "hugging_face_secret" {
+  description = "Secret ID in Secret Manager."
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "hugging_face_secret_version" {
+  description = "Secret version in Secret Manager."
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "project_id" {
+  description = "Project ID of the existing or created project."
+  type        = string
+}