From 85302075602aa2d502c7504d2585adff8d6b252e Mon Sep 17 00:00:00 2001
From: Anna Pendleton
Date: Wed, 18 Sep 2024 17:23:31 +0000
Subject: [PATCH 1/3] enable vllm autoscaling with cmsa

---
 .../hpa.vllm.custom_metric.yaml.tftpl         | 30 ++++++++++++++++
 benchmarks/inference-server/vllm/main.tf      | 25 +++++++++++++
 .../vllm/sample-terraform.tfvars              | 16 +++++++++
 benchmarks/inference-server/vllm/variables.tf | 36 +++++++++++++++++++
 4 files changed, 107 insertions(+)
 create mode 100644 benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl

diff --git a/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl b/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
new file mode 100644
index 000000000..2c318d848
--- /dev/null
+++ b/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
@@ -0,0 +1,30 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: vllm
+  namespace: ${namespace}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: vllm
+  minReplicas: ${hpa_min_replicas}
+  maxReplicas: ${hpa_max_replicas}
+  metrics:
+%{ if length(regexall("DCGM_.*", custom_metric_name)) > 0 }
+  - type: External
+    external:
+      metric:
+        name: prometheus.googleapis.com|${lower(custom_metric_name)}|unknown
+      target:
+        type: AverageValue
+        averageValue: ${hpa_averagevalue_target}
+%{ else }
+  - type: Pods
+    pods:
+      metric:
+        name: prometheus.googleapis.com|${custom_metric_name}|gauge
+      target:
+        type: AverageValue
+        averageValue: ${hpa_averagevalue_target}
+%{ endif }
diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
index 7627efa5e..5701aba14 100644
--- a/benchmarks/inference-server/vllm/main.tf
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -16,6 +16,11 @@ locals {
+  hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl"
+  use_vllm_metrics_for_hpa = var.hpa_type == null ? false : length(regexall("vllm.*", var.hpa_type)) > 0
+  custom_metrics_enabled = local.use_vllm_metrics_for_hpa
+
+
   all_templates = concat(local.wl_templates, local.secret_templates)
 
   wl_templates = [
@@ -66,3 +71,23 @@ resource "kubernetes_manifest" "vllm-pod-monitoring" {
     namespace = var.namespace
   }))
 }
+
+module "custom_metrics_stackdriver_adapter" {
+  count  = local.custom_metrics_enabled ? 1 : 0
+  source = "../../../modules/custom-metrics-stackdriver-adapter"
+  workload_identity = {
+    enabled    = true
+    project_id = var.project_id
+  }
+}
+
+resource "kubernetes_manifest" "hpa_custom_metric" {
+  count = local.custom_metrics_enabled ? 1 : 0
+  manifest = yamldecode(templatefile(local.hpa_custom_metric_template, {
+    namespace               = var.namespace
+    custom_metric_name      = var.hpa_type
+    hpa_averagevalue_target = var.hpa_averagevalue_target
+    hpa_min_replicas        = var.hpa_min_replicas
+    hpa_max_replicas        = var.hpa_max_replicas
+  }))
+}
diff --git a/benchmarks/inference-server/vllm/sample-terraform.tfvars b/benchmarks/inference-server/vllm/sample-terraform.tfvars
index 51d0cf466..cbe79c669 100644
--- a/benchmarks/inference-server/vllm/sample-terraform.tfvars
+++ b/benchmarks/inference-server/vllm/sample-terraform.tfvars
@@ -7,3 +7,19 @@ ksa        = "benchmark-ksa"
 model_id   = "tiiuae/falcon-7b"
 gpu_count  = 1
 project_id = ""
+
+# How to (horizontally) scale the workload. Allowed values are:
+# - Workload metrics (i.e. custom metrics):
+#   - "vllm:gpu_cache_usage_perc"
+#   - "vllm:num_requests_waiting"
+# - Other possibilities coming soon...
+#
+# See `autoscaling.md` for more details and recommendations.
+# hpa_type = "vllm:gpu_cache_usage_perc"
+
+# Sets the averageValue target of the HPA metric.
+# hpa_averagevalue_target = 0.95
+
+# Adjust these if you want different min/max values
+# hpa_min_replicas = 1
+# hpa_max_replicas = 5
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/variables.tf b/benchmarks/inference-server/vllm/variables.tf
index ee8e4428b..71ff800ee 100644
--- a/benchmarks/inference-server/vllm/variables.tf
+++ b/benchmarks/inference-server/vllm/variables.tf
@@ -106,3 +106,39 @@ variable "project_id" {
   description = "Project id of existing or created project."
   type        = string
 }
+
+
+
+variable "hpa_type" {
+  description = "How the TGI workload should be scaled."
+  type        = string
+  default     = null
+  nullable    = true
+  validation {
+    condition     = var.hpa_type == null ? true : length(regexall("cpu|vllm.*|DCGM_.*", var.hpa_type)) > 0
+    error_message = "Allowed values for hpa_type are {null, or vLLM metrics (e.g., \"vllm:num_requests_waiting\", \"vllm:gpu_cache_usage_perc\")}"
+  }
+}
+
+variable "hpa_min_replicas" {
+  description = "Minimum number of HPA replicas."
+  type        = number
+  default     = 1
+  nullable    = false
+}
+
+variable "hpa_max_replicas" {
+  description = "Maximum number of HPA replicas."
+  type        = number
+  default     = 5
+  nullable    = false
+}
+
+# TODO: combine hpa variables into a single object (so that they can be
+# validated together)
+variable "hpa_averagevalue_target" {
+  description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null."
+  type        = number
+  default     = null
+  nullable    = true
+}
\ No newline at end of file

From 76cdcc180998e57b21634717ffdcb1368d11915e Mon Sep 17 00:00:00 2001
From: Anna Pendleton
Date: Wed, 18 Sep 2024 17:26:41 +0000
Subject: [PATCH 2/3] small fixes

---
 .../hpa-templates/hpa.vllm.custom_metric.yaml.tftpl | 10 ----------
 benchmarks/inference-server/vllm/variables.tf       |  6 ++----
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl b/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
index 2c318d848..956241afd 100644
--- a/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
+++ b/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
@@ -11,15 +11,6 @@ spec:
   minReplicas: ${hpa_min_replicas}
   maxReplicas: ${hpa_max_replicas}
   metrics:
-%{ if length(regexall("DCGM_.*", custom_metric_name)) > 0 }
-  - type: External
-    external:
-      metric:
-        name: prometheus.googleapis.com|${lower(custom_metric_name)}|unknown
-      target:
-        type: AverageValue
-        averageValue: ${hpa_averagevalue_target}
-%{ else }
   - type: Pods
     pods:
       metric:
         name: prometheus.googleapis.com|${custom_metric_name}|gauge
       target:
         type: AverageValue
         averageValue: ${hpa_averagevalue_target}
-%{ endif }
diff --git a/benchmarks/inference-server/vllm/variables.tf b/benchmarks/inference-server/vllm/variables.tf
index 71ff800ee..a59e4726f 100644
--- a/benchmarks/inference-server/vllm/variables.tf
+++ b/benchmarks/inference-server/vllm/variables.tf
@@ -107,15 +107,13 @@ variable "project_id" {
   type        = string
 }
 
-
-
 variable "hpa_type" {
-  description = "How the TGI workload should be scaled."
+  description = "How the vllm workload should be scaled."
   type        = string
   default     = null
   nullable    = true
   validation {
-    condition     = var.hpa_type == null ? true : length(regexall("cpu|vllm.*|DCGM_.*", var.hpa_type)) > 0
+    condition     = var.hpa_type == null ? true : length(regexall("vllm.*", var.hpa_type)) > 0
     error_message = "Allowed values for hpa_type are {null, or vLLM metrics (e.g., \"vllm:num_requests_waiting\", \"vllm:gpu_cache_usage_perc\")}"
   }
 }

From 7b07e79cacb4c6f28076f4c13f2cc25ba93f7fe9 Mon Sep 17 00:00:00 2001
From: Anna Pendleton
Date: Wed, 18 Sep 2024 18:10:26 +0000
Subject: [PATCH 3/3] fmt

---
 benchmarks/inference-server/vllm/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
index 5701aba14..997dc7571 100644
--- a/benchmarks/inference-server/vllm/main.tf
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -17,7 +17,7 @@ locals {
   hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl"
-  use_vllm_metrics_for_hpa = var.hpa_type == null ? false : length(regexall("vllm.*", var.hpa_type)) > 0
+  use_vllm_metrics_for_hpa   = var.hpa_type == null ? false : length(regexall("vllm.*", var.hpa_type)) > 0
   custom_metrics_enabled = local.use_vllm_metrics_for_hpa
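
For reviewers: with the commented sample values in sample-terraform.tfvars
enabled (hpa_type = "vllm:gpu_cache_usage_perc", hpa_averagevalue_target =
0.95, hpa_min_replicas = 1, hpa_max_replicas = 5), the template as it stands
after PATCH 2/3 renders roughly the HPA below. This is a sketch, not output
from an actual apply; the "benchmark" namespace is illustrative and would come
from var.namespace.

apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: vllm
  namespace: benchmark
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm
  minReplicas: 1
  maxReplicas: 5
  metrics:
  - type: Pods
    pods:
      metric:
        name: prometheus.googleapis.com|vllm:gpu_cache_usage_perc|gauge
      target:
        type: AverageValue
        averageValue: 0.95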