diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
index 739d46f7d..b477c334b 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
+++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
@@ -24,7 +24,7 @@ ninja # For faster builds.
 psutil
 ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
-numpy
+numpy < 2.0
 torch == 2.1.1
 transformers >= 4.37.0 # Required for Qwen2
 xformers == 0.0.23
diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
index 4161e0347..7627efa5e 100644
--- a/benchmarks/inference-server/vllm/main.tf
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -43,6 +43,7 @@ locals {
     ? null
     : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
   )
+  vllm_podmonitoring = "${path.module}/monitoring-templates/vllm-podmonitoring.yaml.tftpl"
 }

 resource "kubernetes_manifest" "default" {
@@ -59,3 +60,9 @@ resource "kubernetes_manifest" "default" {
     create = "60m"
   }
 }
+
+resource "kubernetes_manifest" "vllm-pod-monitoring" {
+  manifest = yamldecode(templatefile(local.vllm_podmonitoring, {
+    namespace = var.namespace
+  }))
+}
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
index 169ec4356..336c45167 100644
--- a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
@@ -55,7 +55,7 @@ spec:
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
         env:
-        - name: PORT
+        - name: VLLM_PORT
           value: 80
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
         - name: HUGGING_FACE_HUB_TOKEN # Related token consumption
diff --git a/benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl b/benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl
new file mode 100644
index 000000000..f582fc76e
--- /dev/null
+++ b/benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl
@@ -0,0 +1,12 @@
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring
+metadata:
+  name: "vllm-podmonitoring"
+  namespace: ${namespace}
+spec:
+  selector:
+    matchLabels:
+      app: vllm
+  endpoints:
+  - port: 80
+    interval: 15s
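
For illustration, this is roughly the manifest the new kubernetes_manifest.vllm-pod-monitoring resource applies after Terraform renders the template: templatefile substitutes ${namespace} and yamldecode turns the result into the object handed to the Kubernetes provider. The namespace value "benchmark" below is a hypothetical example of var.namespace, not part of the change.

# Rendered vllm-podmonitoring.yaml.tftpl, assuming var.namespace = "benchmark" (hypothetical value)
apiVersion: monitoring.googleapis.com/v1
kind: PodMonitoring
metadata:
  name: "vllm-podmonitoring"
  namespace: benchmark
spec:
  selector:
    matchLabels:
      app: vllm       # assumes the vLLM Deployment's pods carry the app: vllm label
  endpoints:
  - port: 80          # same port the OpenAI-compatible API server listens on
    interval: 15s     # scrape interval for Managed Service for Prometheus

With Google Cloud Managed Service for Prometheus enabled on the cluster, this scrapes the matching pods on port 80 (default path /metrics) every 15 seconds.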