diff --git a/benchmarks/inference-server/text-generation-inference/hpa-templates/dcgm-podmonitoring.yaml.tftpl b/benchmarks/inference-server/text-generation-inference/hpa-templates/dcgm-podmonitoring.yaml.tftpl
new file mode 100644
index 000000000..0c45ba49d
--- /dev/null
+++ b/benchmarks/inference-server/text-generation-inference/hpa-templates/dcgm-podmonitoring.yaml.tftpl
@@ -0,0 +1,44 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring
+metadata:
+  name: nvidia-dcgm-exporter-for-hpa
+  namespace: gmp-public
+  labels:
+    app.kubernetes.io/name: nvidia-dcgm-exporter
+    app.kubernetes.io/part-of: google-cloud-managed-prometheus
+spec:
+  selector:
+    matchLabels:
+      app: nvidia-dcgm-exporter
+  endpoints:
+  - port: metrics
+    interval: 30s
+    metricRelabeling:
+    # Change the DCGM metric name that we want to use in HPA to lowercase.
+    # This is because HPA doesn't work with uppercase external metrics:
+    # https://github.com/kubernetes/kubernetes/issues/72996
+    #
+    # GMP will generate two metrics here: a gauge metric, suffixed with
+    # `unknown`, and a counter metric, suffixed with `unknown:counter`.
+    # https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#unknown-double-written
+    - action: keep
+      sourceLabels: [__name__]
+    - action: replace
+      sourceLabels: [__name__]
+      targetLabel: __name__
+      regex: ${custom_metric_name}
+      replacement: ${lower(custom_metric_name)}
diff --git a/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl b/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl
index aca29da46..62dc98b90 100644
--- a/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl
+++ b/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl
@@ -11,6 +11,15 @@ spec:
   minReplicas: ${hpa_min_replicas}
   maxReplicas: ${hpa_max_replicas}
   metrics:
+%{ if length(regexall("DCGM_.*", custom_metric_name)) > 0 }
+  - type: External
+    external:
+      metric:
+        name: prometheus.googleapis.com|${lower(custom_metric_name)}|unknown
+      target:
+        type: AverageValue
+        averageValue: ${hpa_averagevalue_target}
+%{ else }
   - type: Pods
     pods:
       metric:
@@ -18,3 +27,4 @@ spec:
       target:
         type: AverageValue
         averageValue: ${hpa_averagevalue_target}
+%{ endif }
diff --git a/benchmarks/inference-server/text-generation-inference/main.tf b/benchmarks/inference-server/text-generation-inference/main.tf
index 95043b409..1a9a711f2 100644
--- a/benchmarks/inference-server/text-generation-inference/main.tf
+++ b/benchmarks/inference-server/text-generation-inference/main.tf
@@ -21,7 +21,10 @@ locals {
   hpa_cpu_template           = "${path.module}/hpa-templates/hpa.cpu.yaml.tftpl"
   hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl"
   tgi_podmonitoring          = "${path.module}/hpa-templates/tgi-podmonitoring.yaml.tftpl"
-  custom_metrics_enabled     = !(var.hpa_type == null || var.hpa_type == "cpu")
+  dcgm_podmonitoring         = "${path.module}/hpa-templates/dcgm-podmonitoring.yaml.tftpl"
+  dcgm_metrics_enabled       = var.hpa_type == null ? false : length(regexall("DCGM_.*", var.hpa_type)) > 0
+  tgi_metrics_enabled        = var.hpa_type == null ? false : length(regexall("tgi_.*", var.hpa_type)) > 0
+  custom_metrics_enabled     = local.dcgm_metrics_enabled || local.tgi_metrics_enabled

   wl_templates = [
     for f in fileset(local.wl_templates_path, "*tftpl") :
@@ -84,12 +87,19 @@ resource "kubernetes_manifest" "hpa-cpu" {
 }

 resource "kubernetes_manifest" "tgi-pod-monitoring" {
-  count = local.custom_metrics_enabled ? 1 : 0
+  count = local.tgi_metrics_enabled ? 1 : 0
   manifest = yamldecode(templatefile(local.tgi_podmonitoring, {
     namespace = var.namespace
   }))
 }

+resource "kubernetes_manifest" "dcgm-pod-monitoring" {
+  count = local.dcgm_metrics_enabled ? 1 : 0
+  manifest = yamldecode(templatefile(local.dcgm_podmonitoring, {
+    custom_metric_name = var.hpa_type
+  }))
+}
+
 resource "kubernetes_manifest" "hpa_custom_metric" {
   count = local.custom_metrics_enabled ? 1 : 0
   manifest = yamldecode(templatefile(local.hpa_custom_metric_template, {
diff --git a/benchmarks/inference-server/text-generation-inference/variables.tf b/benchmarks/inference-server/text-generation-inference/variables.tf
index db8ad6e7a..1a90313a8 100644
--- a/benchmarks/inference-server/text-generation-inference/variables.tf
+++ b/benchmarks/inference-server/text-generation-inference/variables.tf
@@ -97,8 +97,8 @@ variable "hpa_type" {
   default     = null
   nullable    = true
   validation {
-    condition     = var.hpa_type == null ? true : contains(["cpu", "tgi_queue_size", "tgi_batch_current_size", "tgi_batch_current_max_tokens"], var.hpa_type)
-    error_message = "Allows values for hpa_type are {null, \"cpu\", \"tgi_queue_size\", \"tgi_batch_current_size\", \"tgi_batch_current_max_tokens\"}"
+    condition     = var.hpa_type == null ? true : length(regexall("cpu|tgi_.*|DCGM_.*", var.hpa_type)) > 0
+    error_message = "Allowed values for hpa_type are {null, \"cpu\", TGI metrics (e.g., \"tgi_queue_size\", \"tgi_batch_current_size\"), or DCGM metrics (e.g., \"DCGM_FI_DEV_MEM_COPY_UTIL\")}"
   }
 }