enable vllm autoscaling with cmsa (#825)
* enable vllm autoscaling with cmsa

* small fixes

* fmt
annapendleton authored Sep 18, 2024
1 parent dff2b93 commit 6e48fd5
Showing 4 changed files with 95 additions and 0 deletions.
20 changes: 20 additions & 0 deletions benchmarks/inference-server/vllm/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl
@@ -0,0 +1,20 @@
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: vllm
  namespace: ${namespace}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm
  minReplicas: ${hpa_min_replicas}
  maxReplicas: ${hpa_max_replicas}
  metrics:
  - type: Pods
    pods:
      metric:
        name: prometheus.googleapis.com|${custom_metric_name}|gauge
      target:
        type: AverageValue
        averageValue: ${hpa_averagevalue_target}
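
The pipe characters in the metric name stand in for the "/" of the underlying Cloud Monitoring metric (prometheus.googleapis.com/<metric>/gauge), since "/" is not allowed in Kubernetes custom metric names. A quick way to preview what this template renders to is a throwaway output; the following is a sketch with illustrative values, not part of this commit:

# Sketch only: render the new template with example values to inspect the
# manifest Terraform will apply. The namespace and targets are illustrative.
output "rendered_vllm_hpa" {
  value = templatefile("${path.module}/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl", {
    namespace               = "benchmark"
    custom_metric_name      = "vllm:gpu_cache_usage_perc"
    hpa_averagevalue_target = 0.95
    hpa_min_replicas        = 1
    hpa_max_replicas        = 5
  })
}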
25 changes: 25 additions & 0 deletions benchmarks/inference-server/vllm/main.tf
@@ -16,6 +16,11 @@

locals {

  hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl"
  use_vllm_metrics_for_hpa   = var.hpa_type == null ? false : length(regexall("vllm.*", var.hpa_type)) > 0
  custom_metrics_enabled     = local.use_vllm_metrics_for_hpa

  all_templates = concat(local.wl_templates, local.secret_templates)

  wl_templates = [
@@ -66,3 +71,23 @@ resource "kubernetes_manifest" "vllm-pod-monitoring" {
    namespace = var.namespace
  }))
}

module "custom_metrics_stackdriver_adapter" {
count = local.custom_metrics_enabled ? 1 : 0
source = "../../../modules/custom-metrics-stackdriver-adapter"
workload_identity = {
enabled = true
project_id = var.project_id
}
}

resource "kubernetes_manifest" "hpa_custom_metric" {
count = local.custom_metrics_enabled ? 1 : 0
manifest = yamldecode(templatefile(local.hpa_custom_metric_template, {
namespace = var.namespace
custom_metric_name = var.hpa_type
hpa_averagevalue_target = var.hpa_averagevalue_target
hpa_min_replicas = var.hpa_min_replicas
hpa_max_replicas = var.hpa_max_replicas
}))
}
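
Both count expressions key off local.custom_metrics_enabled, so neither the adapter module nor the HPA manifest is created unless hpa_type matches "vllm.*". A minimal, hypothetical tfvars excerpt that activates this code path (values are illustrative):

# Hypothetical values; with these set, both counts above evaluate to 1.
hpa_type                = "vllm:num_requests_waiting"
hpa_averagevalue_target = 10
hpa_min_replicas        = 1
hpa_max_replicas        = 5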
16 changes: 16 additions & 0 deletions benchmarks/inference-server/vllm/sample-terraform.tfvars
@@ -7,3 +7,19 @@ ksa = "benchmark-ksa"
model_id = "tiiuae/falcon-7b"
gpu_count = 1
project_id = "<project_id>"

# How to (horizontally) scale the workload. Allowed values are:
# - Workload metrics (i.e. custom metrics):
# - "vllm:gpu_cache_usage_perc"
# - "vllm:num_requests_waiting"
# - Other possibilities coming soon...
#
# See `autoscaling.md` for more details and recommendations.
# hpa_type = "vllm:gpu_cache_usage_perc"

# Sets the averageValue target of the HPA metric.
# hpa_averagevalue_target = 0.95

# Adjust these if you want different min/max values
# hpa_min_replicas = 1
# hpa_max_replicas = 5
34 changes: 34 additions & 0 deletions benchmarks/inference-server/vllm/variables.tf
@@ -106,3 +106,37 @@ variable "project_id" {
description = "Project id of existing or created project."
type = string
}

variable "hpa_type" {
description = "How the vllm workload should be scaled."
type = string
default = null
nullable = true
validation {
condition = var.hpa_type == null ? true : length(regexall("vllm.*", var.hpa_type)) > 0
error_message = "Allows values for hpa_type are {null, or vLLM metrics (e.g., \"vllm:num_requests_waiting\", \"vllm:gpu_cache_usage_perc\")}"
}
}

variable "hpa_min_replicas" {
description = "Minimum number of HPA replicas."
type = number
default = 1
nullable = false
}

variable "hpa_max_replicas" {
description = "Maximum number of HPA replicas."
type = number
default = 5
nullable = false
}

# TODO: combine hpa variables into a single object (so that they can be
# validated together)
variable "hpa_averagevalue_target" {
  description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null."
  type        = number
  default     = null
  nullable    = true
}
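
One possible shape for the combined variable the TODO above mentions, sketched here as an illustration only (not part of this commit; assumes Terraform 1.3+ for optional()):

# Illustrative only: grouping the HPA settings lets one validation block check
# that an averagevalue target is supplied whenever a metric type is chosen.
variable "hpa" {
  description = "Horizontal Pod Autoscaler settings for the vllm workload."
  type = object({
    type                = optional(string)
    averagevalue_target = optional(number)
    min_replicas        = optional(number, 1)
    max_replicas        = optional(number, 5)
  })
  default = {}
  validation {
    condition     = var.hpa.type == null || var.hpa.averagevalue_target != null
    error_message = "hpa.averagevalue_target must be set when hpa.type is set."
  }
}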
