Support autoscaling TGI servers based on DCGM metrics (#318)

GoogleCloudPlatform · Mar 11, 2024 · bd289db · bd289db
1 parent 56762ac
commit bd289db
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 4 deletions.
diff --git a/...ks/inference-server/text-generation-inference/hpa-templates/dcgm-podmonitoring.yaml.tftpl b/...ks/inference-server/text-generation-inference/hpa-templates/dcgm-podmonitoring.yaml.tftpl
@@ -0,0 +1,44 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring
+metadata:
+  name: nvidia-dcgm-exporter-for-hpa
+  namespace: gmp-public
+  labels:
+    app.kubernetes.io/name: nvidia-dcgm-exporter
+    app.kubernetes.io/part-of: google-cloud-managed-prometheus
+spec:
+  selector:
+    matchLabels:
+      app: nvidia-dcgm-exporter
+  endpoints:
+    - port: metrics
+      interval: 30s
+      metricRelabeling:
+        # Keep only the DCGM metric that drives the HPA; drop everything else.
+        - action: keep
+          sourceLabels: [__name__]
+          regex: ${custom_metric_name}
+        # Lowercase the metric name: HPA doesn't work with uppercase external
+        # metrics (https://github.com/kubernetes/kubernetes/issues/72996).
+        # GMP will generate two metrics here: a gauge suffixed with `unknown`
+        # and a counter suffixed with `unknown:counter`.
+        # https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#unknown-double-written
+        - action: replace
+          sourceLabels: [__name__]
+          targetLabel: __name__
+          regex: ${custom_metric_name}
+          replacement: ${lower(custom_metric_name)}
diff --git a/...inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl b/...inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl
@@ -11,10 +11,20 @@ spec:
   minReplicas: ${hpa_min_replicas}
   maxReplicas: ${hpa_max_replicas}
   metrics:
+%{ if length(regexall("^DCGM_", custom_metric_name)) > 0 }
+  - type: External  # DCGM branch: only names that start with DCGM_
+    external:
+      metric:
+        name: prometheus.googleapis.com|${lower(custom_metric_name)}|unknown
+      target:
+        type: AverageValue
+        averageValue: ${hpa_averagevalue_target}
+%{ else }
   - type: Pods
     pods:
       metric:
         name: prometheus.googleapis.com|${custom_metric_name}|gauge
       target:
         type: AverageValue
         averageValue: ${hpa_averagevalue_target}
+%{ endif }
diff --git a/benchmarks/inference-server/text-generation-inference/main.tf b/benchmarks/inference-server/text-generation-inference/main.tf
@@ -21,7 +21,10 @@ locals {
   hpa_cpu_template           = "${path.module}/hpa-templates/hpa.cpu.yaml.tftpl"
   hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl"
   tgi_podmonitoring          = "${path.module}/hpa-templates/tgi-podmonitoring.yaml.tftpl"
-  custom_metrics_enabled     = !(var.hpa_type == null || var.hpa_type == "cpu")
+  dcgm_podmonitoring         = "${path.module}/hpa-templates/dcgm-podmonitoring.yaml.tftpl"
+  dcgm_metrics_enabled       = var.hpa_type == null ? false : length(regexall("^DCGM_", var.hpa_type)) > 0 # prefix match only
+  tgi_metrics_enabled        = var.hpa_type == null ? false : length(regexall("^tgi_", var.hpa_type)) > 0  # prefix match only
+  custom_metrics_enabled     = local.dcgm_metrics_enabled || local.tgi_metrics_enabled                     # any non-cpu metric
 
   wl_templates = [
     for f in fileset(local.wl_templates_path, "*tftpl") :
@@ -84,12 +87,19 @@ resource "kubernetes_manifest" "hpa-cpu" {
 }
 
 resource "kubernetes_manifest" "tgi-pod-monitoring" {
-  count = local.custom_metrics_enabled ? 1 : 0
+  count = local.tgi_metrics_enabled ? 1 : 0 # one instance when local.tgi_metrics_enabled is true, otherwise none
   manifest = yamldecode(templatefile(local.tgi_podmonitoring, {
     namespace = var.namespace
   }))
 }
 
+resource "kubernetes_manifest" "dcgm-pod-monitoring" {
+  count = local.dcgm_metrics_enabled ? 1 : 0 # one instance when local.dcgm_metrics_enabled is true, otherwise none
+  manifest = yamldecode(templatefile(local.dcgm_podmonitoring, {
+    custom_metric_name = var.hpa_type # rendered into the template's metricRelabeling rules
+  }))
+}
+
 resource "kubernetes_manifest" "hpa_custom_metric" {
   count = local.custom_metrics_enabled ? 1 : 0
   manifest = yamldecode(templatefile(local.hpa_custom_metric_template, {

diff --git a/benchmarks/inference-server/text-generation-inference/variables.tf b/benchmarks/inference-server/text-generation-inference/variables.tf
@@ -97,8 +97,8 @@ variable "hpa_type" {
   default     = null
   nullable    = true
   validation {
-    condition     = var.hpa_type == null ? true : contains(["cpu", "tgi_queue_size", "tgi_batch_current_size", "tgi_batch_current_max_tokens"], var.hpa_type)
-    error_message = "Allows values for hpa_type are {null, \"cpu\", \"tgi_queue_size\", \"tgi_batch_current_size\", \"tgi_batch_current_max_tokens\"}"
+    condition     = var.hpa_type == null ? true : length(regexall("^(cpu|tgi_.*|DCGM_.*)$", var.hpa_type)) > 0 # anchored: the whole value must match, not a substring
+    error_message = "Allows values for hpa_type are {null, \"cpu\", TGI metrics (e.g., \"tgi_queue_size\", \"tgi_batch_current_size\") or DCGM metrics (e.g., \"DCGM_FI_DEV_MEM_COPY_UTIL\") }"
   }
 }