From f3e12b32eaa8efd36f3357e93f980352ea7e92f1 Mon Sep 17 00:00:00 2001
From: Ming Zhu
Date: Wed, 5 Jun 2024 10:21:33 -0700
Subject: [PATCH] Support vLLM OpenAI API server (#694)

* Support vLLM OpenAI API server

* make terraform lint happy

---------

Co-authored-by: Ming Zhu
---
 benchmarks/inference-server/vllm/main.tf      |  1 +
 .../vllm/manifest-templates/vllm.tftpl        |  4 ++--
 benchmarks/inference-server/vllm/variables.tf | 11 +++++++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
index f34e3eec9..4161e0347 100644
--- a/benchmarks/inference-server/vllm/main.tf
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -51,6 +51,7 @@ resource "kubernetes_manifest" "default" {
     namespace                      = var.namespace
     model_id                       = var.model_id
     gpu_count                      = var.gpu_count
+    swap_space                     = var.swap_space
     ksa                            = var.ksa
     hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
   }))
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
index 1de1b1f4e..993f577b4 100644
--- a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
@@ -52,8 +52,8 @@ spec:
         ports:
           - containerPort: 80
         image: "vllm/vllm-openai:v0.3.3"
-        command: ["python3", "-m", "vllm.entrypoints.api_server"]
-        args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80"]
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
         env:
           - name: PORT
            value: 80
diff --git a/benchmarks/inference-server/vllm/variables.tf b/benchmarks/inference-server/vllm/variables.tf
index 79455ca03..ee8e4428b 100644
--- a/benchmarks/inference-server/vllm/variables.tf
+++ b/benchmarks/inference-server/vllm/variables.tf
@@ -58,6 +58,17 @@ variable "gpu_count" {
   }
 }
 
+variable "swap_space" {
+  description = "The size (GiB) of CPU memory per GPU to use as swap space. See https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py#L65 for more details."
+  type        = number
+  nullable    = false
+  default     = 4
+  validation {
+    condition     = var.swap_space >= 0
+    error_message = "swap space must be greater than or equal to 0."
+  }
+}
+
 variable "ksa" {
   description = "Kubernetes Service Account used for workload."
   type        = string
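
Switching the container command from vllm.entrypoints.api_server to
vllm.entrypoints.openai.api_server replaces the legacy /generate endpoint
with the OpenAI-compatible /v1 routes, so benchmark clients must POST to
/v1/completions (or /v1/chat/completions) instead. A minimal smoke test in
Python, assuming the pod's port 80 has been forwarded to localhost:8000
(e.g. via kubectl port-forward) and a hypothetical model id standing in for
var.model_id:

# Minimal smoke test for the OpenAI-compatible server deployed by this patch.
# Assumptions (not part of the patch): the pod's port 80 is forwarded to
# localhost:8000, and MODEL_ID matches the --model value rendered from
# var.model_id.
import json
import urllib.request

MODEL_ID = "meta-llama/Llama-2-7b-hf"  # hypothetical; substitute your model_id

payload = {
    "model": MODEL_ID,
    "prompt": "The capital of France is",
    "max_tokens": 16,
    "temperature": 0.0,
}
request = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(request) as response:
    body = json.load(response)

# OpenAI-style responses nest generations under "choices", unlike the
# legacy /generate endpoint's "text" list.
print(body["choices"][0]["text"])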
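
Both new flags are existing vLLM server options: --swap-space sizes the
per-GPU CPU-RAM area that vLLM swaps preempted sequences' KV-cache blocks
into (mainly relevant when sampling with best_of or n > 1), and the
Terraform default of 4 GiB matches vLLM's own default; --disable-log-requests
turns off per-request logging so log output does not add noise or overhead
during benchmark runs.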