diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
index 4161e0347..cc2b8f1fc 100644
--- a/benchmarks/inference-server/vllm/main.tf
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -43,6 +43,7 @@ locals {
     ? null
     : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
   )
+  vllm_podmonitoring = "${path.module}/monitoring-templates/vllm-podmonitoring.yaml.tftpl"
 }
 
 resource "kubernetes_manifest" "default" {
@@ -59,3 +60,9 @@ resource "kubernetes_manifest" "default" {
     create = "60m"
   }
 }
+
+resource "kubernetes_manifest" "vllm-pod-monitoring" {
+  manifest = yamldecode(templatefile(local.vllm_podmonitoring, {
+    namespace = var.namespace
+  }))
+}
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
index 169ec4356..336c45167 100644
--- a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
@@ -55,7 +55,7 @@ spec:
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
         env:
-        - name: PORT
+        - name: VLLM_PORT
           value: 80
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
         - name: HUGGING_FACE_HUB_TOKEN # Related token consumption
diff --git a/benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl b/benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl
new file mode 100644
index 000000000..f582fc76e
--- /dev/null
+++ b/benchmarks/inference-server/vllm/monitoring-templates/vllm-podmonitoring.yaml.tftpl
@@ -0,0 +1,12 @@
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring
+metadata:
+  name: "vllm-podmonitoring"
+  namespace: ${namespace}
+spec:
+  selector:
+    matchLabels:
+      app: vllm
+  endpoints:
+  - port: 80
+    interval: 15s