diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf
index 3906c6417..be95ee59e 100644
--- a/benchmarks/benchmark/tools/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/latency-profile/main.tf
@@ -28,6 +28,11 @@ locals {
     ? "${path.module}/manifest-templates"
     : pathexpand(var.templates_path)
   )
+  hugging_face_token_secret = (
+    var.hugging_face_secret == null || var.hugging_face_secret_version == null
+    ? null
+    : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
+  )
 
   all_manifests = flatten([for manifest_file in local.templates :
     [for data in split("---", templatefile(manifest_file, {
@@ -38,13 +43,12 @@
       inference_server_framework                 = var.inference_server_framework
       ksa                                        = var.ksa
       latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
-      google_service_account                     = var.google_service_account
       max_num_prompts                            = var.max_num_prompts
       max_output_len                             = var.max_output_len
       max_prompt_len                             = var.max_prompt_len
       request_rates                              = join(",", [for number in var.request_rates : tostring(number)])
       tokenizer                                  = var.tokenizer
-      hugging_face_token_b64                     = var.hugging_face_token_b64
+      hugging_face_token_secret_list             = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
       k8s_hf_secret_list                         = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
       output_bucket                              = var.output_bucket
     })) : data]
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl
deleted file mode 100644
index d61629ee4..000000000
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token
-  namespace: ${namespace}
-data:
-  HF_TOKEN: ${hugging_face_token_b64}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
index c3fb03b83..72e5773a2 100644
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
+++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
@@ -41,11 +41,20 @@ spec:
           value: ${request_rates}
         - name: OUTPUT_BUCKET
           value: ${output_bucket}
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
         - name: HF_TOKEN
           valueFrom:
             secretKeyRef:
               name: hf-token
               key: HF_TOKEN
+%{ endfor ~}
+%{ for hf_token in k8s_hf_secret_list ~}
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: ${hf_token}
+              key: HF_TOKEN
+%{ endfor ~}
       nodeSelector:
         cloud.google.com/gke-accelerator: nvidia-l4 # nvidia-h100-80gb, nvidia-l4
         iam.gke.io/gke-metadata-server-enabled: "true"
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl
deleted file mode 100644
index 95e400737..000000000
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl
+++ /dev/null
@@ -1,4 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: ${namespace}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl
deleted file mode 100644
index 02eff324f..000000000
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: ${latency_profile_kubernetes_service_account}
-  namespace: ${namespace}
-  annotations:
-    iam.gke.io/gcp-service-account: "${google_service_account}@tpu-vm-gke-testing.iam.gserviceaccount.com"
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf
index 896934cf0..935a5bab8 100644
--- a/benchmarks/benchmark/tools/latency-profile/variables.tf
+++ b/benchmarks/benchmark/tools/latency-profile/variables.tf
@@ -143,13 +143,6 @@ variable "latency_profile_kubernetes_service_account" {
   default     = "sample-runner-ksa"
 }
 
-variable "google_service_account" {
-  description = "Google Service Account bound to the kubernetes service account"
-  type        = string
-  default     = ""
-  nullable    = false
-}
-
 // TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644
 variable "k8s_hf_secret" {
   description = "Name of secret for huggingface token; stored in k8s "
@@ -158,59 +151,16 @@ variable "k8s_hf_secret" {
   default     = null
 }
 
-variable "hugging_face_token_b64" {
-  description = "Base 64 encoded hugging face token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
+variable "hugging_face_secret" {
+  description = "Name of the Secret Manager secret holding the Hugging Face token. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
   type        = string
-  nullable    = false
+  nullable    = true
+  default     = null
 }
 
-variable "pipeline_config" {
-  description = "All combinations of model/model_server/accelerators to benchmark"
-  type = object({
-    valid_models       = list(string)
-    valid_accelerators = list(string)
-    request_rates      = list(number)
-
-    config = list(object({
-      model_server = string # Model server name
-      model_server_configs = list(object({
-        models = list(string) # model name
-        model_configs = list(object({
-          accelerators = list(string) # Accelerator name
-          accelerator_configs = list(object({
-            accelerator_count = number # Number of accelerators
-          }))
-        }))
-      }))
-    }))
-  })
-
-  validation {
-    condition = alltrue([
-      for cfg in var.pipeline_config.config : alltrue([
-        for model_server_config in cfg.model_server_configs : (
-          alltrue([
-            for model_config in model_server_config.model_configs :
-            alltrue([for accelerator in model_config.accelerators :
-            contains(var.pipeline_config.valid_accelerators, accelerator)])
-          ])
-        )
-      ])
-    ])
-    error_message = "Each accelerator must be in the valid_accelerators list."
-  }
-
-  validation {
-    condition = alltrue([
-      for cfg in var.pipeline_config.config : alltrue([
-        for model_server_config in cfg.model_server_configs : (
-          alltrue([
-            for model in model_server_config.models :
-            contains(var.pipeline_config.valid_models, model)
-          ])
-        )
-      ])
-    ])
-    error_message = "Each model must be in the valid_models list."
-  }
-}
\ No newline at end of file
+variable "hugging_face_secret_version" {
+  description = "Secret version in Secret Manager"
+  type        = string
+  nullable    = true
+  default     = null
+}
diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf
index e69de29bb..e8ea340a2 100644
--- a/benchmarks/benchmark/tools/profile-generator/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/main.tf
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+module "latency-profile" {
+  source = "../latency-profile"
+
+  credentials_config                         = var.credentials_config
+  namespace                                  = var.namespace
+  project_id                                 = var.project_id
+  ksa                                        = var.ksa
+  templates_path                             = var.templates_path
+  artifact_registry                          = var.artifact_registry
+  inference_server_service                   = var.inference_server_service
+  inference_server_service_port              = var.inference_server_service_port
+  inference_server_framework                 = var.inference_server_framework
+  max_num_prompts                            = var.max_num_prompts
+  max_output_len                             = var.max_output_len
+  max_prompt_len                             = var.max_prompt_len
+  request_rates                              = var.request_rates
+  tokenizer                                  = var.tokenizer
+  output_bucket                              = var.output_bucket
+  latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
+  k8s_hf_secret                              = var.k8s_hf_secret
+  hugging_face_secret                        = var.hugging_face_secret
+  hugging_face_secret_version                = var.hugging_face_secret_version
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars
new file mode 100644
index 000000000..907b76eef
--- /dev/null
+++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars
@@ -0,0 +1,105 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+credentials_config = {
+  kubeconfig = {
+    path = "~/.kube/config"
+  }
+}
+
+project_id = "tpu-vm-gke-testing"
+
+
+# Latency profile generator service configuration
+artifact_registry                           = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark"
+inference_server_service                    = "maxengine-server" # inference server service name
+inference_server_service_port               = 8000
+latency_profile_kubernetes_service_account  = "prom-frontend-sa"
+output_bucket                               = "tpu-vm-gke-testing-benchmark-output-bucket"
+k8s_hf_secret                               = "hf-token"
+
+# Benchmark configuration for the latency profile generator accessing the inference server
+inference_server_framework = "jetstream"
+tokenizer                  = "google/gemma-7b"
+request_rates              = [5, 10, 15, 20]
+
+profiles = {
+  valid_models = [
+    "gemma2-2b",
+    "gemma2-9b",
+    "gemma2-27b",
+    "llama3-8b",
+    "llama3-70b",
+    "llama3-405b"
+  ]
+  valid_accelerators = [
+    "tpu-v4-podslice",
+    "tpu-v5-lite-podslice",
+    "tpu-v5p-slice",
+    "nvidia-a100-80gb",
+    "nvidia-h100-80gb",
+    "nvidia-l4"
+  ]
+  request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+
+  config = [{
+    model_server = "Jetstream"
+    model_server_configs = [{
+      models = [
+        "gemma2-2b",
+        "gemma2-9b",
+        "gemma2-27b"
+      ]
+      model_configs = []
+    }]
+  }, {
+    model_server = "vllm"
+    model_server_configs = [{
+      models = [
+        "gemma2-2b",
+        "gemma2-9b",
+        "gemma2-27b",
+        "llama3-8b",
+        "llama3-70b",
+        "llama3-405b"
+      ]
+      model_configs = []
+    }]
+  }, {
+    model_server = "tgi"
+    model_server_configs = [{
+      models = [
+        "gemma2-2b",
+        "gemma2-9b",
+        "gemma2-27b",
+        "llama3-8b",
+        "llama3-70b",
+        "llama3-405b"
+      ]
+      model_configs = []
+    }]
+  }, {
+    model_server = "tensorrt-llm"
+    model_server_configs = [{
+      models = [
+        "llama3-8b",
+        "llama3-70b",
+        "llama3-405b"
+      ]
+      model_configs = []
+    }]
+  }]
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf
index e69de29bb..dbbfddc1a 100644
--- a/benchmarks/benchmark/tools/profile-generator/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -0,0 +1,218 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+variable "credentials_config" {
+  description = "Configure how Terraform authenticates to the cluster."
+  type = object({
+    fleet_host = optional(string)
+    kubeconfig = optional(object({
+      context = optional(string)
+      path    = optional(string, "~/.kube/config")
+    }))
+  })
+  nullable = false
+  validation {
+    condition = (
+      (var.credentials_config.fleet_host != null) !=
+      (var.credentials_config.kubeconfig != null)
+    )
+    error_message = "Exactly one of fleet host or kubeconfig must be set."
+  }
+}
+
+variable "namespace" {
+  description = "Namespace used for model and benchmarking deployments."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "project_id" {
+  description = "Project id of existing or created project."
+  type        = string
+  nullable    = false
+}
+
+
+variable "ksa" {
+  description = "Kubernetes Service Account used for workload."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "templates_path" {
+  description = "Path where manifest templates will be read from. Set to null to use the default manifests"
+  type        = string
+  default     = null
+}
+
+variable "artifact_registry" {
+  description = "Artifact registry for storing Latency Profile Generator container."
+  type        = string
+  default     = null
+}
+
+variable "inference_server_service" {
+  description = "Name of the inference server service"
+  type        = string
+  nullable    = false
+}
+
+variable "inference_server_service_port" {
+  description = "Port of the inference server service"
+  type        = number
+  nullable    = false
+}
+
+variable "inference_server_framework" {
+  description = "Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt_llm_triton, sax, jetstream"
+  type        = string
+  nullable    = false
+  default     = "tgi"
+  validation {
+    condition     = var.inference_server_framework == "vllm" || var.inference_server_framework == "tgi" || var.inference_server_framework == "tensorrt_llm_triton" || var.inference_server_framework == "sax" || var.inference_server_framework == "jetstream"
+    error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream."
+  }
+}
+
+variable "max_num_prompts" {
+  description = "Benchmark server configuration for max number of prompts."
+  type        = number
+  default     = 1000
+  validation {
+    condition     = var.max_num_prompts > 0
+    error_message = "The max_num_prompts value must be greater than 0."
+  }
+}
+
+variable "max_output_len" {
+  description = "Benchmark server configuration for max output length."
+  type        = number
+  default     = 256
+  validation {
+    condition     = var.max_output_len > 4
+    error_message = "The max_output_len value must be greater than 4. TGI framework throws an error for too short of sequences."
+  }
+}
+
+variable "max_prompt_len" {
+  description = "Benchmark server configuration for max prompt length."
+  type        = number
+  default     = 256
+  validation {
+    condition     = var.max_prompt_len > 4
+    error_message = "The max_prompt_len value must be greater than 4. TGI framework throws an error for too short of sequences."
+  }
+}
+
+variable "request_rates" {
+  description = "List of request rates (requests per second) to benchmark at."
+  type        = list(number)
+  default     = [1, 2]
+  nullable    = false
+}
+
+variable "tokenizer" {
+  description = "Benchmark server configuration for tokenizer."
+  type        = string
+  nullable    = false
+  default     = "tiiuae/falcon-7b"
+}
+
+variable "output_bucket" {
+  description = "Bucket name for storing results"
+  type        = string
+}
+
+variable "latency_profile_kubernetes_service_account" {
+  description = "Kubernetes Service Account to be used for the latency profile generator tool"
+  type        = string
+  default     = "sample-runner-ksa"
+}
+
+// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644
+variable "k8s_hf_secret" {
+  description = "Name of secret for Hugging Face token; stored in k8s."
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "hugging_face_secret" {
+  description = "Name of the Secret Manager secret holding the Hugging Face token. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "hugging_face_secret_version" {
+  description = "Secret version in Secret Manager"
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "profiles" {
+  description = "Model servers to benchmark"
+  type = object({
+    valid_models       = list(string)
+    valid_accelerators = list(string)
+    request_rates      = list(number)
+
+    config = list(object({
+      model_server = string # Model server name
+      model_server_configs = list(object({
+        models = list(string) # model name
+        model_configs = list(object({
+          accelerators = list(string) # Accelerator name
+          accelerator_configs = list(object({
+            accelerator_count = number # Number of accelerators
+          }))
+        }))
+      }))
+    }))
+  })
+
+  validation {
+    condition = alltrue([
+      for cfg in var.profiles.config : alltrue([
+        for model_server_config in cfg.model_server_configs : (
+          alltrue([
+            for model_config in model_server_config.model_configs :
+            alltrue([for accelerator in model_config.accelerators :
+            contains(var.profiles.valid_accelerators, accelerator)])
+          ])
+        )
+      ])
+    ])
+    error_message = "Each accelerator must be in the valid_accelerators list."
+  }
+
+  validation {
+    condition = alltrue([
+      for cfg in var.profiles.config : alltrue([
+        for model_server_config in cfg.model_server_configs : (
+          alltrue([
+            for model in model_server_config.models :
+            contains(var.profiles.valid_models, model)
+          ])
+        )
+      ])
+    ])
+    error_message = "Each model must be in the valid_models list."
+  }
+}
\ No newline at end of file
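
Note on wiring the new variables: the Hugging Face token can now come either from an existing Kubernetes secret (k8s_hf_secret) or from Secret Manager (hugging_face_secret plus hugging_face_secret_version). latency-profile/main.tf joins the latter pair into a "<secret>/versions/<version>" path, and the manifest template only emits an HF_TOKEN env var when the corresponding list is non-empty. A minimal sketch of the Secret Manager path; the project and secret names below are illustrative assumptions, not values taken from this change:

    # terraform.tfvars -- illustrative sketch only, not part of this diff.
    # local.hugging_face_token_secret in latency-profile/main.tf resolves this to
    # "projects/my-project/secrets/hf-token-secret/versions/1".
    hugging_face_secret         = "projects/my-project/secrets/hf-token-secret"
    hugging_face_secret_version = "1"

Until the TODO'd mutual-exclusion validation lands, exactly one of k8s_hf_secret or hugging_face_secret should be set; setting both would render two HF_TOKEN entries into the generated manifest.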