Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable vllm autoscaling with cmsa #825

Merged
merged 3 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# HorizontalPodAutoscaler for the "vllm" Deployment, rendered by Terraform's
# templatefile(). The namespace, replica bounds, metric name and target value
# are all supplied as template variables by the caller.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: vllm
  namespace: ${namespace}
spec:
  # Workload whose replica count is managed by this autoscaler.
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm
  minReplicas: ${hpa_min_replicas}
  maxReplicas: ${hpa_max_replicas}
  metrics:
  # Per-pod metric, averaged across the pods of the target Deployment.
  - type: Pods
    pods:
      metric:
        # The metric name is wrapped in a "prometheus.googleapis.com|...|gauge"
        # envelope around the caller-supplied metric name.
        name: prometheus.googleapis.com|${custom_metric_name}|gauge
      target:
        type: AverageValue
        averageValue: ${hpa_averagevalue_target}
25 changes: 25 additions & 0 deletions benchmarks/inference-server/vllm/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@

locals {

hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl"
use_vllm_metrics_for_hpa = var.hpa_type == null ? false : length(regexall("vllm.*", var.hpa_type)) > 0
custom_metrics_enabled = local.use_vllm_metrics_for_hpa


all_templates = concat(local.wl_templates, local.secret_templates)

wl_templates = [
Expand Down Expand Up @@ -66,3 +71,23 @@ resource "kubernetes_manifest" "vllm-pod-monitoring" {
namespace = var.namespace
}))
}

# Instantiates the custom-metrics-stackdriver-adapter module, but only when
# custom-metric based autoscaling has been enabled (zero instances otherwise).
# NOTE(review): presumably this adapter is what exposes the metric to the HPA;
# confirm against the module's own documentation.
module "custom_metrics_stackdriver_adapter" {
  source = "../../../modules/custom-metrics-stackdriver-adapter"
  count  = local.custom_metrics_enabled ? 1 : 0

  # The adapter is configured with workload identity in the target project.
  workload_identity = {
    project_id = var.project_id
    enabled    = true
  }
}

# Creates the HorizontalPodAutoscaler manifest from the custom-metric template.
# Zero instances are created unless custom-metric autoscaling is enabled.
resource "kubernetes_manifest" "hpa_custom_metric" {
  count = local.custom_metrics_enabled ? 1 : 0

  # Render the template with the HPA settings, then decode the resulting YAML
  # into the object form expected by `manifest`.
  manifest = yamldecode(
    templatefile(
      local.hpa_custom_metric_template,
      {
        namespace               = var.namespace
        hpa_min_replicas        = var.hpa_min_replicas
        hpa_max_replicas        = var.hpa_max_replicas
        custom_metric_name      = var.hpa_type
        hpa_averagevalue_target = var.hpa_averagevalue_target
      }
    )
  )
}
16 changes: 16 additions & 0 deletions benchmarks/inference-server/vllm/sample-terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,19 @@ ksa = "benchmark-ksa"
model_id = "tiiuae/falcon-7b"
gpu_count = 1
project_id = "<project_id>"

# How to (horizontally) scale the workload. Allowed values are:
# - Workload metrics (i.e. custom metrics):
# - "vllm:gpu_cache_usage_perc"
# - "vllm:num_requests_waiting"
# - Other possibilities coming soon...
#
# See `autoscaling.md` for more details and recommendations.
# hpa_type = "vllm:gpu_cache_usage_perc"

# Sets the AverageValue target for the HPA metric. Must be set if hpa_type is set.
# hpa_averagevalue_target = 0.95

# Adjust these if you want different min/max values
# hpa_min_replicas = 1
# hpa_max_replicas = 5
34 changes: 34 additions & 0 deletions benchmarks/inference-server/vllm/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,37 @@ variable "project_id" {
description = "Project id of existing or created project."
type = string
}

# Selects the metric used to horizontally scale the vLLM workload.
# The default (null) is accepted as-is; any non-null value must be a vLLM
# metric name such as "vllm:num_requests_waiting".
variable "hpa_type" {
  description = "How the vllm workload should be scaled."
  type        = string
  default     = null
  nullable    = true
  validation {
    # The pattern is anchored at the start of the string: an unanchored
    # "vllm.*" would also accept values that merely contain "vllm" somewhere
    # (e.g. "not-vllm"), which would then be used verbatim as the metric name.
    condition = var.hpa_type == null ? true : length(regexall("^vllm", var.hpa_type)) > 0
    # Written as a full sentence ending in a period, which older Terraform
    # releases require of validation error messages.
    error_message = "Allowed values for hpa_type are {null, or vLLM metrics (e.g., \"vllm:num_requests_waiting\", \"vllm:gpu_cache_usage_perc\")}."
  }
}

# Lower bound on the replica count passed to the HPA; defaults to 1 and may
# not be set to null.
variable "hpa_min_replicas" {
  type        = number
  description = "Minimum number of HPA replicas."
  nullable    = false
  default     = 1
}

# Upper bound on the replica count passed to the HPA; defaults to 5 and may
# not be set to null.
variable "hpa_max_replicas" {
  type        = number
  description = "Maximum number of HPA replicas."
  nullable    = false
  default     = 5
}

# TODO: combine hpa variables into a single object (so that they can be
# validated together)
#
# Target value for the HPA metric. It defaults to null, and nothing in this
# declaration enforces the "must be set" rule stated in the description.
variable "hpa_averagevalue_target" {
  type        = number
  description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null."
  nullable    = true
  default     = null
}
Loading