Skip to content

Commit

Permalink
Support autoscaling TGI servers based on DCGM metrics (#318)
Browse files Browse the repository at this point in the history
  • Loading branch information
laoj2 authored Mar 11, 2024
1 parent 56762ac commit bd289db
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# PodMonitoring (Google Cloud Managed Service for Prometheus) that scrapes the
# pods labelled `app: nvidia-dcgm-exporter` and republishes the DCGM metric
# chosen for autoscaling under a lowercase name.
#
# Template variable:
#   custom_metric_name - the metric name to match and rename to lowercase.
apiVersion: monitoring.googleapis.com/v1
kind: PodMonitoring
metadata:
  name: nvidia-dcgm-exporter-for-hpa
  namespace: gmp-public
  labels:
    app.kubernetes.io/name: nvidia-dcgm-exporter
    app.kubernetes.io/part-of: google-cloud-managed-prometheus
spec:
  selector:
    matchLabels:
      app: nvidia-dcgm-exporter
  endpoints:
  - port: metrics
    interval: 30s
    metricRelabeling:
    # Change the DCGM metric name that we want to use in HPA to lowercase.
    # This is because HPA doesn't work with uppercase external metrics:
    # https://github.com/kubernetes/kubernetes/issues/72996
    #
    # GMP will generate two metrics here: a gauge metric, suffixed with
    # `unknown`, and a counter metric, suffixed with `unknown:counter`.
    # https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#unknown-double-written
    #
    # NOTE(review): this keep rule sets no `regex`. Assuming the usual
    # Prometheus relabeling default of `(.*)`, it matches every series and
    # therefore drops nothing - confirm whether it was meant to be restricted
    # to the custom metric only.
    - action: keep
      sourceLabels: [__name__]
    # Rename the series whose name equals the custom metric to its lowercase
    # form; series with any other name are left untouched.
    - action: replace
      sourceLabels: [__name__]
      targetLabel: __name__
      regex: ${custom_metric_name}
      replacement: ${lower(custom_metric_name)}
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,20 @@ spec:
minReplicas: ${hpa_min_replicas}
maxReplicas: ${hpa_max_replicas}
metrics:
%{ if length(regexall("DCGM_.*", custom_metric_name)) > 0 }
- type: External
external:
metric:
name: prometheus.googleapis.com|${lower(custom_metric_name)}|unknown
target:
type: AverageValue
averageValue: ${hpa_averagevalue_target}
%{ else }
- type: Pods
pods:
metric:
name: prometheus.googleapis.com|${custom_metric_name}|gauge
target:
type: AverageValue
averageValue: ${hpa_averagevalue_target}
%{ endif }
14 changes: 12 additions & 2 deletions benchmarks/inference-server/text-generation-inference/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ locals {
hpa_cpu_template = "${path.module}/hpa-templates/hpa.cpu.yaml.tftpl"
hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl"
tgi_podmonitoring = "${path.module}/hpa-templates/tgi-podmonitoring.yaml.tftpl"
custom_metrics_enabled = !(var.hpa_type == null || var.hpa_type == "cpu")
dcgm_podmonitoring = "${path.module}/hpa-templates/dcgm-podmonitoring.yaml.tftpl"
dcgm_metrics_enabled = var.hpa_type == null ? false : length(regexall("DCGM_.*", var.hpa_type)) > 0
tgi_metrics_enabled = var.hpa_type == null ? false : length(regexall("tgi_.*", var.hpa_type)) > 0
custom_metrics_enabled = local.dcgm_metrics_enabled || local.tgi_metrics_enabled

wl_templates = [
for f in fileset(local.wl_templates_path, "*tftpl") :
Expand Down Expand Up @@ -84,12 +87,19 @@ resource "kubernetes_manifest" "hpa-cpu" {
}

# Renders the TGI PodMonitoring template (local.tgi_podmonitoring) into a
# Kubernetes manifest.
#
# Created only when local.tgi_metrics_enabled is true. A single `count`
# argument is kept because Terraform rejects a block that sets the same
# argument twice; the broader local.custom_metrics_enabled gate is dropped so
# this resource is not created when autoscaling on a DCGM metric.
resource "kubernetes_manifest" "tgi-pod-monitoring" {
  count = local.tgi_metrics_enabled ? 1 : 0
  manifest = yamldecode(templatefile(local.tgi_podmonitoring, {
    namespace = var.namespace
  }))
}

# Renders the DCGM PodMonitoring template (local.dcgm_podmonitoring) into a
# Kubernetes manifest, passing var.hpa_type to the template as
# `custom_metric_name`. Created only when local.dcgm_metrics_enabled is true.
resource "kubernetes_manifest" "dcgm-pod-monitoring" {
  count = local.dcgm_metrics_enabled ? 1 : 0

  manifest = yamldecode(
    templatefile(
      local.dcgm_podmonitoring,
      { custom_metric_name = var.hpa_type }
    )
  )
}

resource "kubernetes_manifest" "hpa_custom_metric" {
count = local.custom_metrics_enabled ? 1 : 0
manifest = yamldecode(templatefile(local.hpa_custom_metric_template, {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ variable "hpa_type" {
default = null
nullable = true
validation {
  # Accept null (no HPA), exactly "cpu", or a metric name that starts with
  # "tgi_" or "DCGM_". The pattern is anchored with ^...$ because regexall
  # otherwise matches anywhere in the string, which would let values that
  # merely contain "cpu", "tgi_" or "DCGM_" (e.g. "my_cpu_x") pass validation.
  condition     = var.hpa_type == null ? true : length(regexall("^(cpu|tgi_.*|DCGM_.*)$", var.hpa_type)) > 0
  error_message = "Allowed values for hpa_type are {null, \"cpu\", TGI metrics (e.g., \"tgi_queue_size\", \"tgi_batch_current_size\") or DCGM metrics (e.g., \"DCGM_FI_DEV_MEM_COPY_UTIL\") }"
}
}

Expand Down

0 comments on commit bd289db

Please sign in to comment.