GoogleCloudPlatform · annapendleton · Sep 4, 2024 · Sep 4, 2024 · Sep 4, 2024
@@ -24,7 +24,7 @@ ninja  # For faster builds.
 psutil
 ray >= 2.9
 sentencepiece  # Required for LLaMA tokenizer.
-numpy
+numpy < 2.0  # NOTE(review): presumably capped because the pinned torch == 2.1.1 predates NumPy 2.x - confirm
 torch == 2.1.1
 transformers >= 4.37.0 # Required for Qwen2
 xformers == 0.0.23

@@ -43,6 +43,7 @@ locals {
     ? null
     : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
   )
+  vllm_podmonitoring = "${path.module}/monitoring-templates/vllm-podmonitoring.yaml.tftpl"
 }
 
 resource "kubernetes_manifest" "default" {
@@ -59,3 +60,9 @@ resource "kubernetes_manifest" "default" {
     create = "60m"
   }
 }
+
+resource "kubernetes_manifest" "vllm-pod-monitoring" {  # Applies the manifest rendered from the file at local.vllm_podmonitoring.
+  manifest = yamldecode(templatefile(local.vllm_podmonitoring, {  # Render the template file first, then decode the resulting YAML.
+    namespace = var.namespace  # The only variable passed to the template.
+  }))
+}
@@ -55,7 +55,7 @@ spec:
           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
           args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
           env:
-            - name: PORT
+            - name: VLLM_PORT
               value: 80
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
             - name: HUGGING_FACE_HUB_TOKEN # Related token consumption

@@ -0,0 +1,12 @@
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring  # NOTE(review): custom resource kind; presumably requires the monitoring.googleapis.com CRDs on the cluster - confirm.
+metadata:
+  name: "vllm-podmonitoring"
+  namespace: ${namespace}  # Template placeholder; presumably filled in by the caller that renders this file.
+spec:
+  selector:
+    matchLabels:
+      app: vllm  # NOTE(review): assumes the vLLM pods carry the label app=vllm - confirm against the Deployment.
+  endpoints:
+  - port: 80  # NOTE(review): presumably must match the port the vLLM server listens on - keep in sync.
+    interval: 15s