Skip to content

Commit

Permalink
add podmonitoring to vllm, small fix (#796)
Browse files Browse the repository at this point in the history
* add podmonitoring to vllm, small fix

* fix numpy error in logs, run terraform fmt
  • Loading branch information
annapendleton committed Sep 4, 2024
1 parent c872599 commit 8027565
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ ninja # For faster builds.
psutil
ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
numpy < 2.0
torch == 2.1.1
transformers >= 4.37.0 # Required for Qwen2
xformers == 0.0.23
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/inference-server/vllm/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ locals {
? null
: "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
)
vllm_podmonitoring = "${path.module}/monitoring-templates/vllm-podmonitoring.yaml.tftpl"
}

resource "kubernetes_manifest" "default" {
Expand All @@ -59,3 +60,9 @@ resource "kubernetes_manifest" "default" {
create = "60m"
}
}

# Deploys a Google Managed Prometheus PodMonitoring object so vLLM server
# metrics are scraped. The manifest is rendered from a .tftpl template with
# the target namespace injected, then parsed into an HCL object for the
# kubernetes_manifest resource.
resource "kubernetes_manifest" "vllm-pod-monitoring" {
  manifest = yamldecode(
    templatefile(
      local.vllm_podmonitoring,
      {
        namespace = var.namespace
      }
    )
  )
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ spec:
command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
env:
- name: PORT
- name: VLLM_PORT
  # Kubernetes EnvVar.value must be a string; an unquoted 80 is parsed as a
  # YAML integer and rejected by the API server at admission time.
  value: "80"
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
- name: HUGGING_FACE_HUB_TOKEN # Related token consumption
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Google Cloud Managed Service for Prometheus PodMonitoring resource.
# Rendered via Terraform templatefile(); ${namespace} is substituted at
# render time (see the kubernetes_manifest resource in main.tf).
apiVersion: monitoring.googleapis.com/v1
kind: PodMonitoring
metadata:
  name: "vllm-podmonitoring"
  namespace: ${namespace}  # injected by templatefile()
spec:
  # Scrape every pod carrying the app=vllm label in this namespace.
  selector:
    matchLabels:
      app: vllm
  endpoints:
    # Scrape the vLLM server's container port 80 every 15 seconds.
    # NOTE(review): assumes the vLLM server exposes /metrics on port 80 —
    # confirm against the Deployment's containerPort.
    - port: 80
      interval: 15s

0 comments on commit 8027565

Please sign in to comment.