diff --git a/benchmarks/inference-server/text-generation-inference/README.md b/benchmarks/inference-server/text-generation-inference/README.md
index ee0dd3865..70413d6d6 100644
--- a/benchmarks/inference-server/text-generation-inference/README.md
+++ b/benchmarks/inference-server/text-generation-inference/README.md
@@ -30,6 +30,8 @@ cp sample-terraform.tfvars terraform.tfvars
 
 Fill out your `terraform.tfvars` with the desired model and server configuration, referring to the list of required and optional variables [here](#variables). Variables `credentials_config` are required.
 
+Optionally configure HPA (Horizontal Pod Autoscaling) by setting `hpa_type`. Note: to scale on custom metrics, GMP (Google Managed Prometheus) must be enabled on the cluster (it is by default). See `autoscaling.md` for more details.
+
 #### Determine number of gpus
 
 `gpu_count` should be configured respective to the size of the model with some overhead for the kv cache. Here's an example on figuring out how many GPUs you need to run a model:
diff --git a/benchmarks/inference-server/text-generation-inference/autoscaling.md b/benchmarks/inference-server/text-generation-inference/autoscaling.md
new file mode 100644
index 000000000..2378cde9f
--- /dev/null
+++ b/benchmarks/inference-server/text-generation-inference/autoscaling.md
@@ -0,0 +1,31 @@
+# Autoscaling TGI
+
+## tl;dr
+
+Recommendation: TODO
+
+## Autoscaling Options
+
+### CPU
+
+CPU scaling is a poor choice for this workload: the TGI workload starts up,
+pulls the model weights, and then spends a minute or two's worth of CPU time
+crunching some numbers. This causes the HPA to add a replica, which then
+spends more CPU time, which causes the HPA to add another replica, and so on.
+Eventually things settle and the HPA scales the replicas back down, but the
+whole process can take up to an hour.
+
+### Custom Metrics
+
+Workload/custom metrics can be viewed at
+https://console.cloud.google.com/monitoring/metrics-explorer. (Just search for
+the metric name, e.g. "tgi_batch_current_size". The full name should be
+"prometheus/tgi_batch_current_size/gauge".)
+
+#### `tgi_batch_current_size`
+
+TODO
+
+### External Metrics
+
+TODO
diff --git a/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/README.md b/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/README.md
new file mode 100644
index 000000000..ce2e38665
--- /dev/null
+++ b/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/README.md
@@ -0,0 +1,31 @@
+# Custom Metrics Stackdriver Adapter
+
+Adapted from https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml
+
+## Usage
+
+To use this module, include it from your main Terraform config, e.g.:
+
+```
+module "custom_metrics_stackdriver_adapter" {
+  source = "./path/to/custom-metrics-stackdriver-adapter"
+}
+```
+
+For a workload-identity-enabled cluster, some additional configuration is
+needed:
+
+```
+module "custom_metrics_stackdriver_adapter" {
+  source = "./path/to/custom-metrics-stackdriver-adapter"
+  workload_identity = {
+    enabled    = true
+    project_id = ""
+  }
+}
+```
+
+# TODO
+
+This module should be moved out of the text-generation-inference subdirectory,
+as it is more broadly applicable.
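To tie the above together, a minimal `terraform.tfvars` fragment for custom-metric autoscaling might look like the sketch below. The metric choice and target value are illustrative assumptions, not tuned recommendations, and the required `credentials_config` and model settings are omitted.

```
# Sketch: scale TGI on its queue depth via a GMP workload metric.
hpa_type                = "tgi_queue_size" # or "tgi_batch_current_size", ...
hpa_averagevalue_target = 10               # untuned guess; see autoscaling.md
hpa_min_replicas        = 1
hpa_max_replicas        = 5
project_id              = "my-gcp-project" # hypothetical project id
```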
diff --git a/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/main.tf b/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/main.tf
new file mode 100644
index 000000000..8e2a16ade
--- /dev/null
+++ b/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/main.tf
@@ -0,0 +1,278 @@
+resource "kubernetes_namespace_v1" "custom-metrics" {
+  metadata {
+    name = "custom-metrics"
+  }
+}
+
+resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-no-wi" {
+  count = var.workload_identity.enabled ? 0 : 1
+  metadata {
+    name      = "custom-metrics-stackdriver-adapter"
+    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+  }
+}
+
+resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-wi" {
+  count = var.workload_identity.enabled ? 1 : 0
+  metadata {
+    name      = "custom-metrics-stackdriver-adapter"
+    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+    annotations = {
+      "iam.gke.io/gcp-service-account" = google_service_account.cmsa-sa[0].email
+    }
+  }
+}
+
+resource "kubernetes_cluster_role_binding_v1" "custom-metrics-system-auth-delegator" {
+  metadata {
+    name = "custom-metrics:system:auth-delegator"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = "system:auth-delegator"
+  }
+  subject {
+    kind = "ServiceAccount"
+    name = (var.workload_identity.enabled
+      ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
+      : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
+    )
+    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+  }
+}
+
+resource "kubernetes_role_binding_v1" "custom-metrics-auth-reader" {
+  metadata {
+    name      = "custom-metrics-auth-reader"
+    namespace = "kube-system"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "Role"
+    name      = "extension-apiserver-authentication-reader"
+  }
+  subject {
+    kind = "ServiceAccount"
+    name = (var.workload_identity.enabled
+      ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
+      : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
+    )
+    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+  }
+}
+
+resource "kubernetes_cluster_role_v1" "custom-metrics-resource-reader" {
+  metadata {
+    name = "custom-metrics-resource-reader"
+  }
+  rule {
+    api_groups = [""]
+    resources  = ["pods", "nodes", "nodes/stats"]
+    verbs      = ["get", "list", "watch"]
+  }
+}
+
+resource "kubernetes_cluster_role_binding_v1" "custom-metrics-resource-reader" {
+  metadata {
+    name = "custom-metrics-resource-reader"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role_v1.custom-metrics-resource-reader.metadata[0].name
+  }
+  subject {
+    kind = "ServiceAccount"
+    name = (var.workload_identity.enabled
+      ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
+      : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
+    )
+    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+  }
+}
+
+resource "kubernetes_deployment_v1" "custom-metrics-stackdriver-adapter" {
+  metadata {
+    name      = "custom-metrics-stackdriver-adapter"
+    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+    labels = {
+      run     = "custom-metrics-stackdriver-adapter"
+      k8s-app = "custom-metrics-stackdriver-adapter"
+    }
+  }
+  spec {
+    replicas = 1
+
+    selector {
+      match_labels = {
+        run     = "custom-metrics-stackdriver-adapter"
+        k8s-app = "custom-metrics-stackdriver-adapter"
+      }
+    }
+
+    template {
+      metadata {
+        labels = {
+          run                             = "custom-metrics-stackdriver-adapter"
+          k8s-app                         = "custom-metrics-stackdriver-adapter"
+          "kubernetes.io/cluster-service" = "true"
+        }
+      }
+
+      spec {
+        service_account_name = (var.workload_identity.enabled
+          ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
+          : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
+        )
+
+        container {
+          image             = "gcr.io/gke-release/custom-metrics-stackdriver-adapter:v0.14.2-gke.0"
+          image_pull_policy = "Always"
+          name              = "pod-custom-metrics-stackdriver-adapter"
+          command           = ["/adapter", "--use-new-resource-model=true", "--fallback-for-container-metrics=true"]
+          resources {
+            limits = {
+              cpu    = "250m"
+              memory = "200Mi"
+            }
+            requests = {
+              cpu    = "250m"
+              memory = "200Mi"
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+resource "kubernetes_service_v1" "custom-metrics-stackdriver-adapter" {
+  metadata {
+    name      = "custom-metrics-stackdriver-adapter"
+    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+    labels = {
+      run                             = "custom-metrics-stackdriver-adapter"
+      k8s-app                         = "custom-metrics-stackdriver-adapter"
+      "kubernetes.io/cluster-service" = "true"
+      "kubernetes.io/name"            = "Adapter"
+    }
+  }
+  spec {
+    selector = {
+      run     = "custom-metrics-stackdriver-adapter"
+      k8s-app = "custom-metrics-stackdriver-adapter"
+    }
+    port {
+      port        = 443
+      protocol    = "TCP"
+      target_port = 443
+    }
+    type = "ClusterIP"
+  }
+}
+
+resource "kubernetes_api_service_v1" "v1beta1-custom-metrics-k8s-io" {
+  metadata {
+    name = "v1beta1.custom.metrics.k8s.io"
+  }
+  spec {
+    insecure_skip_tls_verify = true
+    group                    = "custom.metrics.k8s.io"
+    group_priority_minimum   = 100
+    version_priority         = 100
+    service {
+      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
+      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+    }
+    version = "v1beta1"
+  }
+}
+
+resource "kubernetes_api_service_v1" "v1beta2-custom-metrics-k8s-io" {
+  metadata {
+    name = "v1beta2.custom.metrics.k8s.io"
+  }
+  spec {
+    insecure_skip_tls_verify = true
+    group                    = "custom.metrics.k8s.io"
+    group_priority_minimum   = 100
+    version_priority         = 200
+    service {
+      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
+      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+    }
+    version = "v1beta2"
+  }
+}
+
+resource "kubernetes_api_service_v1" "v1beta1-external-metrics-k8s-io" {
+  metadata {
+    name = "v1beta1.external.metrics.k8s.io"
+  }
+  spec {
+    insecure_skip_tls_verify = true
+    group                    = "external.metrics.k8s.io"
+    group_priority_minimum   = 100
+    version_priority         = 100
+    service {
+      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
+      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
+    }
+    version = "v1beta1"
+  }
+}
+
+resource "kubernetes_cluster_role_binding_v1" "external-metrics-reader" {
+  metadata {
+    name = "external-metrics-reader"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = "external-metrics-reader"
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = "horizontal-pod-autoscaler"
+    namespace = "kube-system"
+  }
+}
+
+
+# If workload identity is enabled, extra steps are required. We need to:
+# - create a service account
+# - grant it the monitoring.viewer IAM role
+# - bind it to the workload identity user for the cmsa
+# - annotate the cmsa service account (done above)
+
+resource "google_service_account" "cmsa-sa" {
+  count      = var.workload_identity.enabled ? 1 : 0
+  account_id = "cmsa-sa"
+  project    = var.workload_identity.project_id
+}
+
+# Equivalent to:
+# gcloud projects add-iam-policy-binding PROJECT_ID \
+#   --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \
+#   --role=roles/monitoring.viewer
+# Note: google_project_iam_member is additive, like the gcloud command above;
+# google_project_iam_binding would be authoritative for the role and could
+# strip other members of roles/monitoring.viewer.
+resource "google_project_iam_member" "cmsa-project-binding" {
+  count   = var.workload_identity.enabled ? 1 : 0
+  project = var.workload_identity.project_id
+  role    = "roles/monitoring.viewer"
+  member  = "serviceAccount:${google_service_account.cmsa-sa[0].email}"
+}
+
+# Equivalent to:
+# gcloud iam service-accounts add-iam-policy-binding \
+#   --role roles/iam.workloadIdentityUser \
+#   --member "serviceAccount:PROJECT_ID.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \
+#   cmsa-sa@PROJECT_ID.iam.gserviceaccount.com
+resource "google_service_account_iam_member" "cmsa-bind-to-gsa" {
+  count              = var.workload_identity.enabled ? 1 : 0
+  service_account_id = google_service_account.cmsa-sa[0].name
+  role               = "roles/iam.workloadIdentityUser"
+  member             = "serviceAccount:${var.workload_identity.project_id}.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]"
+}
diff --git a/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/variables.tf b/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/variables.tf
new file mode 100644
index 000000000..c3b338256
--- /dev/null
+++ b/benchmarks/inference-server/text-generation-inference/custom-metrics-stackdriver-adapter/variables.tf
@@ -0,0 +1,16 @@
+variable "workload_identity" {
+  type = object({
+    enabled    = bool
+    project_id = optional(string)
+  })
+  default = {
+    enabled = false
+  }
+  validation {
+    condition = (
+      (var.workload_identity.enabled && var.workload_identity.project_id != null)
+      || (!var.workload_identity.enabled)
+    )
+    error_message = "A project_id must be specified if workload_identity.enabled is set."
+  }
+}
diff --git a/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.cpu.yaml.tftpl b/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.cpu.yaml.tftpl
index 546da4d55..139f39ee8 100644
--- a/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.cpu.yaml.tftpl
+++ b/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.cpu.yaml.tftpl
@@ -10,4 +10,4 @@ spec:
     name: tgi
   minReplicas: ${hpa_min_replicas}
   maxReplicas: ${hpa_max_replicas}
-  targetCPUUtilizationPercentage: 50
+  targetCPUUtilizationPercentage: ${hpa_averagevalue_target}
diff --git a/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl b/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl
new file mode 100644
index 000000000..aca29da46
--- /dev/null
+++ b/benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl
@@ -0,0 +1,20 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tgi
+  namespace: ${namespace}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tgi
+  minReplicas: ${hpa_min_replicas}
+  maxReplicas: ${hpa_max_replicas}
+  metrics:
+  - type: Pods
+    pods:
+      metric:
+        name: prometheus.googleapis.com|${custom_metric_name}|gauge
+      target:
+        type: AverageValue
+        averageValue: ${hpa_averagevalue_target}
diff --git a/benchmarks/inference-server/text-generation-inference/hpa-templates/tgi-podmonitoring.yaml.tftpl b/benchmarks/inference-server/text-generation-inference/hpa-templates/tgi-podmonitoring.yaml.tftpl
new file mode 100644
index 000000000..9a37a6526
--- /dev/null
+++ b/benchmarks/inference-server/text-generation-inference/hpa-templates/tgi-podmonitoring.yaml.tftpl
@@ -0,0 +1,12 @@
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring
+metadata:
+  name: "tgi-podmonitoring"
+  namespace: ${namespace}
+spec:
+  selector:
+    matchLabels:
+      app: tgi
+  endpoints:
+  - port: 80
+    interval: 20s
diff --git a/benchmarks/inference-server/text-generation-inference/main.tf b/benchmarks/inference-server/text-generation-inference/main.tf
index 1fb665b43..95043b409 100644
--- a/benchmarks/inference-server/text-generation-inference/main.tf
+++ b/benchmarks/inference-server/text-generation-inference/main.tf
@@ -18,7 +18,10 @@ locals {
 
   all_templates = concat(local.wl_templates, local.secret_templates)
 
-  hpa_cpu_template = "${path.module}/hpa-templates/hpa.cpu.yaml.tftpl"
+  hpa_cpu_template           = "${path.module}/hpa-templates/hpa.cpu.yaml.tftpl"
+  hpa_custom_metric_template = "${path.module}/hpa-templates/hpa.tgi.custom_metric.yaml.tftpl"
+  tgi_podmonitoring          = "${path.module}/hpa-templates/tgi-podmonitoring.yaml.tftpl"
+  custom_metrics_enabled     = !(var.hpa_type == null || var.hpa_type == "cpu")
 
   wl_templates = [
     for f in fileset(local.wl_templates_path, "*tftpl") :
@@ -47,6 +50,15 @@ locals {
   )
 }
 
+module "custom_metrics_stackdriver_adapter" {
+  count  = local.custom_metrics_enabled ? 1 : 0
+  source = "./custom-metrics-stackdriver-adapter"
+  workload_identity = {
+    enabled    = true
+    project_id = var.project_id
+  }
+}
+
 resource "kubernetes_manifest" "default" {
   for_each = toset(local.all_templates)
   manifest = yamldecode(templatefile(each.value, {
@@ -64,8 +76,27 @@ resource "kubernetes_manifest" "default" {
 
 resource "kubernetes_manifest" "hpa-cpu" {
   count    = var.hpa_type == "cpu" ? 1 : 0
   manifest = yamldecode(templatefile(local.hpa_cpu_template, {
-    namespace        = var.namespace
-    hpa_min_replicas = var.hpa_min_replicas
-    hpa_max_replicas = var.hpa_max_replicas
+    namespace               = var.namespace
+    hpa_averagevalue_target = var.hpa_averagevalue_target
+    hpa_min_replicas        = var.hpa_min_replicas
+    hpa_max_replicas        = var.hpa_max_replicas
+  }))
+}
+
+resource "kubernetes_manifest" "tgi-pod-monitoring" {
+  count    = local.custom_metrics_enabled ? 1 : 0
+  manifest = yamldecode(templatefile(local.tgi_podmonitoring, {
+    namespace = var.namespace
+  }))
+}
+
+resource "kubernetes_manifest" "hpa_custom_metric" {
+  count    = local.custom_metrics_enabled ? 1 : 0
+  manifest = yamldecode(templatefile(local.hpa_custom_metric_template, {
+    namespace               = var.namespace
+    custom_metric_name      = var.hpa_type
+    hpa_averagevalue_target = var.hpa_averagevalue_target
+    hpa_min_replicas        = var.hpa_min_replicas
+    hpa_max_replicas        = var.hpa_max_replicas
   }))
 }
diff --git a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
index 01e613828..70f234977 100644
--- a/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
+++ b/benchmarks/inference-server/text-generation-inference/manifest-templates/text-generation-inference.tftpl
@@ -20,7 +20,6 @@ metadata:
   labels:
     app: tgi
 spec:
-  replicas: 1
   selector:
     matchLabels:
       app: tgi
@@ -75,3 +74,21 @@ spec:
         - mountPath: "/var/secrets"
           name: hftoken
 %{ endfor ~}
+        # Hack: tgi won't generate any metrics until it's served at least one
+        # request, and without that, autoscaling won't work. So generate one
+        # request upon startup.
+        startupProbe:
+          failureThreshold: 3600
+          periodSeconds: 10
+          timeoutSeconds: 60
+          exec:
+            command:
+            - /usr/bin/curl
+            - http://localhost:80/generate
+            - -X
+            - POST
+            - -d
+            - '{"inputs":"test", "parameters":{"max_new_tokens":1}}'
+            - -H
+            - 'Content-Type: application/json'
+            - --fail
diff --git a/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars b/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars
index 6b885cdc1..c04015e83 100644
--- a/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars
+++ b/benchmarks/inference-server/text-generation-inference/sample-terraform.tfvars
@@ -9,17 +9,29 @@ gpu_count = 1
 
 # How to (horizontally) scale the workload. Allowed values are:
 # - null (no scaling),
-# - "cpu" (scale on cpu utilization).
+# - Workload resources:
+#   - "cpu" (scale on CPU utilization).
+# - Workload metrics (i.e. custom metrics):
+#   - "tgi_queue_size"
+#   - "tgi_batch_current_size"
+#   - "tgi_batch_current_max_tokens"
 # - Other possibilities coming soon...
 #
-# Note: "cpu" scaling is a poor choice for this workload - the tgi workload
-# starts up, pulls the model weights, and then spends a minute or two worth of
-# cpu time crunching some numbers. This causes hpa to add a replica, which then
-# spends more cpu time, which causes hpa to add a replica, etc. Eventually,
-# things settle, and hpa scales down the replicas. This whole process could
-# take up to an hour.
+# See `autoscaling.md` for more details and recommendations.
 hpa_type = null
 
+# Sets the averagevalue target of the hpa metric.
+#
+# e.g. for CPU scaling, this is the CPU utilization, expressed as a value
+# between 0-100. 50 is a reasonable starting point.
+#hpa_averagevalue_target = 50
+#
+# For tgi_batch_current_size, try 10. (TODO: experiment with this to determine
+# optimal values.)
+#hpa_averagevalue_target = 10
+
 # Adjust these if you want different min/max values
 # hpa_min_replicas = 1
 # hpa_max_replicas = 5
+
+project_id = ""
diff --git a/benchmarks/inference-server/text-generation-inference/variables.tf b/benchmarks/inference-server/text-generation-inference/variables.tf
index b43e926ab..db8ad6e7a 100644
--- a/benchmarks/inference-server/text-generation-inference/variables.tf
+++ b/benchmarks/inference-server/text-generation-inference/variables.tf
@@ -97,8 +97,8 @@ variable "hpa_type" {
   default  = null
   nullable = true
   validation {
-    condition     = var.hpa_type == null ? true : contains(["cpu"], var.hpa_type)
-    error_message = "Allows values for hpa_type are {null, \"cpu\"}"
+    condition     = var.hpa_type == null ? true : contains(["cpu", "tgi_queue_size", "tgi_batch_current_size", "tgi_batch_current_max_tokens"], var.hpa_type)
+    error_message = "Allowed values for hpa_type are {null, \"cpu\", \"tgi_queue_size\", \"tgi_batch_current_size\", \"tgi_batch_current_max_tokens\"}."
   }
 }
 
@@ -115,3 +115,17 @@ variable "hpa_max_replicas" {
   default  = 5
   nullable = false
 }
+
+# TODO: combine hpa variables into a single object (so that they can be
+# validated together)
+variable "hpa_averagevalue_target" {
+  description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null."
+  type        = number
+  default     = null
+  nullable    = true
+}
+
+variable "project_id" {
+  description = "Project ID of the existing or created project."
+  type        = string
+}
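For the TODO above, one possible shape for the combined object variable — sketched here as an illustration under the assumption of Terraform 1.3+ optional-attribute defaults, not part of this change — would let the type/target relationship be validated in one place:

```
# Hypothetical replacement for the separate hpa_* variables.
variable "hpa_config" {
  description = "Combined HPA settings, cross-validated together."
  type = object({
    type                = optional(string)    # null, "cpu", or a tgi_* metric
    averagevalue_target = optional(number)
    min_replicas        = optional(number, 1)
    max_replicas        = optional(number, 5)
  })
  default = {}
  validation {
    # A target must accompany any non-null scaling type.
    condition     = var.hpa_config.type == null || var.hpa_config.averagevalue_target != null
    error_message = "averagevalue_target must be set when type is non-null."
  }
}
```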