GoogleCloudPlatform · annapendleton · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024
@@ -0,0 +1,20 @@
+# HorizontalPodAutoscaler for the `vllm` Deployment, scaled on a per-pod
+# custom metric. This file is a Terraform template: every dollar-brace
+# placeholder is substituted by templatefile() before the YAML is decoded.
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: vllm
+  # Quoted so a value that looks like a number or boolean is still a string.
+  namespace: "${namespace}"
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: vllm
+  # Replica bounds stay unquoted so they decode as numbers.
+  minReplicas: ${hpa_min_replicas}
+  maxReplicas: ${hpa_max_replicas}
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        # Full metric name has the form "prometheus.googleapis.com|<metric>|gauge".
+        # Quoted because the rendered value contains "|" and ":" characters.
+        name: "prometheus.googleapis.com|${custom_metric_name}|gauge"
+      target:
+        type: AverageValue
+        averageValue: ${hpa_averagevalue_target}
@@ -16,6 +16,11 @@
 
 locals {
 
+  # Template rendered into the HorizontalPodAutoscaler manifest.
+  hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.vllm.custom_metric.yaml.tftpl"
+
+  # True when hpa_type is set and matches the "vllm.*" pattern; an unset
+  # (null) hpa_type yields false.
+  use_vllm_metrics_for_hpa = var.hpa_type != null ? length(regexall("vllm.*", var.hpa_type)) > 0 : false
+
+  # Custom metrics are enabled exactly when the HPA uses vLLM metrics.
+  custom_metrics_enabled = local.use_vllm_metrics_for_hpa
+
+
   all_templates = concat(local.wl_templates, local.secret_templates)
 
   wl_templates = [
@@ -66,3 +71,23 @@ resource "kubernetes_manifest" "vllm-pod-monitoring" {
     namespace = var.namespace
   }))
 }
+
+# Instantiates the custom-metrics-stackdriver-adapter module, but only when
+# custom metrics are enabled (count is 0 otherwise).
+# NOTE(review): presumably this adapter is what makes the custom metric
+# visible to the HPA — confirm against the module's own documentation.
+module "custom_metrics_stackdriver_adapter" {
+  source = "../../../modules/custom-metrics-stackdriver-adapter"
+  count  = local.custom_metrics_enabled ? 1 : 0
+
+  workload_identity = {
+    project_id = var.project_id
+    enabled    = true
+  }
+}
+
+# Creates the HorizontalPodAutoscaler by rendering the custom-metric template
+# and decoding the resulting YAML. Skipped (count = 0) unless custom metrics
+# are enabled.
+# NOTE(review): hpa_averagevalue_target may be null; confirm callers always
+# set it together with hpa_type, since the template interpolates it.
+resource "kubernetes_manifest" "hpa_custom_metric" {
+  count = local.custom_metrics_enabled ? 1 : 0
+
+  manifest = yamldecode(templatefile(
+    local.hpa_custom_metric_template,
+    {
+      namespace               = var.namespace
+      custom_metric_name      = var.hpa_type
+      hpa_min_replicas        = var.hpa_min_replicas
+      hpa_max_replicas        = var.hpa_max_replicas
+      hpa_averagevalue_target = var.hpa_averagevalue_target
+    }
+  ))
+}
@@ -7,3 +7,19 @@ ksa        = "benchmark-ksa"
 model_id   = "tiiuae/falcon-7b"
 gpu_count  = 1
 project_id = "<project_id>"
+
+# How to (horizontally) scale the workload. Allowed values are:
+# - Workload metrics (i.e. custom metrics):
+#   - "vllm:gpu_cache_usage_perc"
+#   - "vllm:num_requests_waiting"
+# - Other possibilities coming soon...
+#
+# See `autoscaling.md` for more details and recommendations.
+# hpa_type = "vllm:gpu_cache_usage_perc"
+
+# Sets the AverageValue target of the HPA metric. Must be set when hpa_type is set.
+# hpa_averagevalue_target = 0.95
+
+# Adjust these if you want different min/max replica counts (defaults shown).
+# hpa_min_replicas = 1
+# hpa_max_replicas = 5
@@ -106,3 +106,37 @@ variable "project_id" {
   description = "Project id of existing or created project."
   type        = string
 }
+
+# Selects the metric used to horizontally scale the vllm workload.
+# null is accepted; any other value must start with "vllm".
+variable "hpa_type" {
+  description = "How the vllm workload should be scaled."
+  type        = string
+  default     = null
+  nullable    = true
+
+  validation {
+    # The pattern is anchored so that a string which merely contains "vllm"
+    # somewhere in the middle is rejected.
+    condition     = var.hpa_type == null ? true : length(regexall("^vllm", var.hpa_type)) > 0
+    error_message = "Allowed values for hpa_type are {null, or vLLM metrics (e.g., \"vllm:num_requests_waiting\", \"vllm:gpu_cache_usage_perc\")}."
+  }
+}
+
+# Lower bound on the number of replicas the HPA may scale down to.
+variable "hpa_min_replicas" {
+  description = "Minimum number of HPA replicas."
+  type        = number
+  default     = 1
+  nullable    = false
+
+  # `number` also admits fractions, zero and negatives; reject those at plan
+  # time instead of letting the manifest fail when it is applied.
+  validation {
+    condition     = var.hpa_min_replicas >= 1 && floor(var.hpa_min_replicas) == var.hpa_min_replicas
+    error_message = "The hpa_min_replicas value must be a whole number greater than or equal to 1."
+  }
+}
+
+# Upper bound on the number of replicas the HPA may scale up to.
+variable "hpa_max_replicas" {
+  description = "Maximum number of HPA replicas."
+  type        = number
+  default     = 5
+  nullable    = false
+
+  # `number` also admits fractions, zero and negatives; reject those at plan
+  # time instead of letting the manifest fail when it is applied.
+  validation {
+    condition     = var.hpa_max_replicas >= 1 && floor(var.hpa_max_replicas) == var.hpa_max_replicas
+    error_message = "The hpa_max_replicas value must be a whole number greater than or equal to 1."
+  }
+}
+
+# TODO: combine hpa variables into a single object (so that they can be
+# validated together)
+#
+# Target average value of the `hpa_type` metric across pods. null is allowed
+# here because this variable alone cannot tell whether `hpa_type` is set.
+variable "hpa_averagevalue_target" {
+  description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null."
+  type        = number
+  default     = null
+  nullable    = true
+
+  # A zero or negative target is never meaningful for an AverageValue metric;
+  # reject it at plan time. null still passes.
+  validation {
+    condition     = var.hpa_averagevalue_target == null ? true : var.hpa_averagevalue_target > 0
+    error_message = "The hpa_averagevalue_target value must be null or a number greater than 0."
+  }
+}