From 6e48fd5d78f08c282f760406acaa8def6c8bb6b2 Mon Sep 17 00:00:00 2001
From: Anna Pendleton
Date: Wed, 18 Sep 2024 11:12:08 -0700
Subject: [PATCH] enable vllm autoscaling with cmsa (#825)

* enable vllm autoscaling with cmsa

* small fixes

* fmt
---
 .../hpa.vllm.custom_metric.yaml.tftpl         | 20 +++++++++++
 benchmarks/inference-server/vllm/main.tf      | 25 ++++++++++++++
 .../vllm/sample-terraform.tfvars              | 16 +++++++++
 benchmarks/inference-server/vllm/variables.tf | 34 +++++++++++++++++++
 4 files changed, 95 insertions(+)
 create mode 100644 benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl

diff --git a/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl b/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
new file mode 100644
index 000000000..956241afd
--- /dev/null
+++ b/benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
@@ -0,0 +1,20 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: vllm
+  namespace: ${namespace}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: vllm
+  minReplicas: ${hpa_min_replicas}
+  maxReplicas: ${hpa_max_replicas}
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: prometheus.googleapis.com|${custom_metric_name}|gauge
+      target:
+        type: AverageValue
+        averageValue: ${hpa_averagevalue_target}
diff --git a/benchmarks/inference-server/vllm/main.tf b/benchmarks/inference-server/vllm/main.tf
index 7627efa5e..997dc7571 100644
--- a/benchmarks/inference-server/vllm/main.tf
+++ b/benchmarks/inference-server/vllm/main.tf
@@ -16,6 +16,11 @@ locals {
 
+  hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl"
+  use_vllm_metrics_for_hpa   = var.hpa_type == null ? false : length(regexall("vllm.*", var.hpa_type)) > 0
+  custom_metrics_enabled     = local.use_vllm_metrics_for_hpa
+
+
   all_templates = concat(local.wl_templates, local.secret_templates)
 
   wl_templates = [
@@ -66,3 +71,23 @@ resource "kubernetes_manifest" "vllm-pod-monitoring" {
     namespace = var.namespace
   }))
 }
+
+module "custom_metrics_stackdriver_adapter" {
+  count  = local.custom_metrics_enabled ? 1 : 0
+  source = "../../../modules/custom-metrics-stackdriver-adapter"
+  workload_identity = {
+    enabled    = true
+    project_id = var.project_id
+  }
+}
+
+resource "kubernetes_manifest" "hpa_custom_metric" {
+  count = local.custom_metrics_enabled ? 1 : 0
+  manifest = yamldecode(templatefile(local.hpa_custom_metric_template, {
+    namespace               = var.namespace
+    custom_metric_name      = var.hpa_type
+    hpa_averagevalue_target = var.hpa_averagevalue_target
+    hpa_min_replicas        = var.hpa_min_replicas
+    hpa_max_replicas        = var.hpa_max_replicas
+  }))
+}
diff --git a/benchmarks/inference-server/vllm/sample-terraform.tfvars b/benchmarks/inference-server/vllm/sample-terraform.tfvars
index 51d0cf466..cbe79c669 100644
--- a/benchmarks/inference-server/vllm/sample-terraform.tfvars
+++ b/benchmarks/inference-server/vllm/sample-terraform.tfvars
@@ -7,3 +7,19 @@ ksa        = "benchmark-ksa"
 model_id   = "tiiuae/falcon-7b"
 gpu_count  = 1
 project_id = ""
+
+# How to (horizontally) scale the workload. Allowed values are:
+# - Workload metrics (i.e. custom metrics):
+#   - "vllm:gpu_cache_usage_perc"
+#   - "vllm:num_requests_waiting"
+# - Other possibilities coming soon...
+#
+# See `autoscaling.md` for more details and recommendations.
+# hpa_type = "vllm:gpu_cache_usage_perc"
+
+# Sets the `averageValue` target of the HPA metric.
+# hpa_averagevalue_target = 0.95
+
+# Adjust these if you want different min/max replica counts.
+# hpa_min_replicas = 1
+# hpa_max_replicas = 5
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/variables.tf b/benchmarks/inference-server/vllm/variables.tf
index ee8e4428b..a59e4726f 100644
--- a/benchmarks/inference-server/vllm/variables.tf
+++ b/benchmarks/inference-server/vllm/variables.tf
@@ -106,3 +106,37 @@ variable "project_id" {
   description = "Project id of existing or created project."
   type        = string
 }
+
+variable "hpa_type" {
+  description = "How the vLLM workload should be scaled."
+  type        = string
+  default     = null
+  nullable    = true
+  validation {
+    condition     = var.hpa_type == null ? true : length(regexall("vllm.*", var.hpa_type)) > 0
+    error_message = "Allowed values for hpa_type are null or vLLM metrics (e.g., \"vllm:num_requests_waiting\", \"vllm:gpu_cache_usage_perc\")."
+  }
+}
+
+variable "hpa_min_replicas" {
+  description = "Minimum number of HPA replicas."
+  type        = number
+  default     = 1
+  nullable    = false
+}
+
+variable "hpa_max_replicas" {
+  description = "Maximum number of HPA replicas."
+  type        = number
+  default     = 5
+  nullable    = false
+}
+
+# TODO: combine hpa variables into a single object (so that they can be
+# validated together)
+variable "hpa_averagevalue_target" {
+  description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null."
+  type        = number
+  default     = null
+  nullable    = true
+}
\ No newline at end of file
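
Usage note (illustrative, not part of the patch): with this change applied, vLLM metric-based autoscaling is driven entirely from tfvars. A minimal sketch using only the variables shown or introduced above; the project id and the 0.95 target are hypothetical placeholders:

  # terraform.tfvars
  namespace  = "benchmark"
  ksa        = "benchmark-ksa"
  model_id   = "tiiuae/falcon-7b"
  gpu_count  = 1
  project_id = "my-gcp-project"  # hypothetical placeholder

  # Scale on vLLM KV-cache utilization: the HPA adds replicas while the
  # average gauge value across pods exceeds the target, bounded by
  # hpa_min_replicas/hpa_max_replicas.
  hpa_type                = "vllm:gpu_cache_usage_perc"
  hpa_averagevalue_target = 0.95
  hpa_min_replicas        = 1
  hpa_max_replicas        = 5

With these values, `terraform apply` deploys the custom-metrics-stackdriver-adapter module and renders hpa.vllm.custom_metric.yaml.tftpl into a HorizontalPodAutoscaler named `vllm` targeting `prometheus.googleapis.com|vllm:gpu_cache_usage_perc|gauge`; it can be inspected with `kubectl get hpa vllm -n benchmark`.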