Skip to content

Commit

Permalink
WIP: replace base64 Hugging Face token with Secret Manager secret/version variables, drop google_service_account and pipeline_config from latency-profile, and add profile-generator wrapper module
Browse files Browse the repository at this point in the history
  • Loading branch information
Bslabe123 committed Aug 16, 2024
1 parent 44143e7 commit 91fe581
Show file tree
Hide file tree
Showing 9 changed files with 387 additions and 80 deletions.
8 changes: 6 additions & 2 deletions benchmarks/benchmark/tools/latency-profile/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ locals {
? "${path.module}/manifest-templates"
: pathexpand(var.templates_path)
)
hugging_face_token_secret = (
var.hugging_face_secret == null || var.hugging_face_secret_version == null
? null
: "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
)

all_manifests = flatten([for manifest_file in local.templates :
[for data in split("---", templatefile(manifest_file, {
Expand All @@ -38,13 +43,12 @@ locals {
inference_server_framework = var.inference_server_framework
ksa = var.ksa
latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
google_service_account = var.google_service_account
max_num_prompts = var.max_num_prompts
max_output_len = var.max_output_len
max_prompt_len = var.max_prompt_len
request_rates = join(",", [for number in var.request_rates : tostring(number)])
tokenizer = var.tokenizer
hugging_face_token_b64 = var.hugging_face_token_b64
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
output_bucket = var.output_bucket
})) : data]
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,20 @@ spec:
value: ${request_rates}
- name: OUTPUT_BUCKET
value: ${output_bucket}
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: HF_TOKEN
%{ endfor ~}
%{ for hf_token in k8s_hf_secret_list ~}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: HF_TOKEN
%{ endfor ~}
nodeSelector:
cloud.google.com/gke-accelerator: nvidia-l4 # nvidia-h100-80gb, nvidia-l4
iam.gke.io/gke-metadata-server-enabled: "true"

This file was deleted.

This file was deleted.

70 changes: 10 additions & 60 deletions benchmarks/benchmark/tools/latency-profile/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,6 @@ variable "latency_profile_kubernetes_service_account" {
default = "sample-runner-ksa"
}

# Google Service Account tied to the benchmark runner's Kubernetes SA
# (presumably via Workload Identity — confirm against the binding resource).
# An empty string means "no GSA"; the value itself may never be null.
variable "google_service_account" {
  description = "Google Service Account bound to the kubernetes service account"
  type        = string
  nullable    = false
  default     = ""
}

// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644
variable "k8s_hf_secret" {
description = "Name of secret for huggingface token; stored in k8s "
Expand All @@ -158,59 +151,16 @@ variable "k8s_hf_secret" {
default = null
}

# Name of the Secret Manager secret holding the Hugging Face token.
# Replaces the former hugging_face_token_b64 variable; combined with
# hugging_face_secret_version to form "<secret>/versions/<version>".
# Both must be non-null for the Secret Manager path to be used.
variable "hugging_face_secret" {
  description = "name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
  type        = string
  nullable    = true
  default     = null
}

# Full benchmark sweep definition: every model / model-server / accelerator
# combination to profile, plus the allow-lists (valid_models and
# valid_accelerators) that the two validation blocks check entries against.
variable "pipeline_config" {
  description = "All combinations of model/model_server/accelerators to benchmark"
  type = object({
    valid_models       = list(string) # allow-list enforced by the model validation below
    valid_accelerators = list(string) # allow-list enforced by the accelerator validation below
    request_rates      = list(number)

    config = list(object({
      model_server = string # Model server name
      model_server_configs = list(object({
        models = list(string) # model name
        model_configs = list(object({
          accelerators = list(string) # Accelerator name
          accelerator_configs = list(object({
            accelerator_count = number # Number of accelerators
          }))
        }))
      }))
    }))
  })

  # Every accelerator named anywhere in config[*] must appear in
  # valid_accelerators; alltrue() over the nested comprehensions walks
  # config -> model_server_configs -> model_configs -> accelerators.
  validation {
    condition = alltrue([
      for cfg in var.pipeline_config.config : alltrue([
        for model_server_config in cfg.model_server_configs : (
          alltrue([
            for model_config in model_server_config.model_configs :
            alltrue([for accelerator in model_config.accelerators :
              contains(var.pipeline_config.valid_accelerators, accelerator)])
          ])
        )
      ])
    ])
    error_message = "Each accelerator must be in the valid_accelerators list."
  }

  # Every model named in config[*].model_server_configs[*].models must
  # appear in valid_models.
  validation {
    condition = alltrue([
      for cfg in var.pipeline_config.config : alltrue([
        for model_server_config in cfg.model_server_configs : (
          alltrue([
            for model in model_server_config.models :
            contains(var.pipeline_config.valid_models, model)
          ])
        )
      ])
    ])
    error_message = "Each model must be in the valid_models list."
  }
}
# Which version of the Secret Manager secret to read. When this or
# hugging_face_secret is null (both default to null), the Secret Manager
# token path is disabled entirely.
variable "hugging_face_secret_version" {
  description = "Secret version in Secret Manager"
  type        = string
  default     = null
  nullable    = true
}
39 changes: 39 additions & 0 deletions benchmarks/benchmark/tools/profile-generator/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# Thin wrapper: the profile generator currently delegates the whole run to
# the shared latency-profile module, forwarding its own inputs one-to-one.
module "latency-profile" {
  source = "../latency-profile"

  # Cluster / project wiring.
  credentials_config = var.credentials_config
  project_id         = var.project_id
  namespace          = var.namespace
  ksa                = var.ksa
  templates_path     = var.templates_path
  artifact_registry  = var.artifact_registry

  # Target inference server.
  inference_server_service      = var.inference_server_service
  inference_server_service_port = var.inference_server_service_port
  inference_server_framework    = var.inference_server_framework

  # Benchmark shape and outputs.
  max_num_prompts = var.max_num_prompts
  max_output_len  = var.max_output_len
  max_prompt_len  = var.max_prompt_len
  request_rates   = var.request_rates
  tokenizer       = var.tokenizer
  output_bucket   = var.output_bucket

  # Identity and Hugging Face token sources.
  latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
  k8s_hf_secret               = var.k8s_hf_secret
  hugging_face_secret         = var.hugging_face_secret
  hugging_face_secret_version = var.hugging_face_secret_version
}
105 changes: 105 additions & 0 deletions benchmarks/benchmark/tools/profile-generator/sample.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# Cluster credentials: use the local kubeconfig rather than inline secrets.
credentials_config = {
  kubeconfig = {
    path = "~/.kube/config"
  }
}

project_id = "tpu-vm-gke-testing"


# Latency profile generator service configuration
artifact_registry                          = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark"
inference_server_service                   = "maxengine-server" # inference server service name
inference_server_service_port              = 8000
latency_profile_kubernetes_service_account = "prom-frontend-sa"
output_bucket                              = "tpu-vm-gke-testing-benchmark-output-bucket"
k8s_hf_secret                              = "hf-token" # Kubernetes secret holding the Hugging Face token

# Benchmark configuration for Locust Docker accessing inference server
inference_server_framework = "jetstream"
tokenizer                  = "google/gemma-7b"
request_rates              = [5, 10, 15, 20] # requests/sec stages for this run

# Sweep definition: which model / model-server / accelerator combinations
# are eligible to be profiled.
profiles = {
  # Allow-list of model names usable in config[*].model_server_configs[*].models.
  valid_models = [
    "gemma2-2b",
    "gemma2-9b",
    "gemma2-27b",
    "llama3-8b",
    "llama3-70b",
    "llama3-405b"
  ]
  # Allow-list of accelerator types usable in per-model accelerator lists.
  valid_accelerators = [
    "tpu-v4-podslice",
    "tpu-v5-lite-podslice",
    "tpu-v5p-slice",
    "nvidia-a100-80gb",
    "nvidia-h100-80gb",
    "nvidia-l4"
  ]
  request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

  # One entry per model server. model_configs is left empty in this sample —
  # presumably no per-accelerator overrides are wanted here; TODO confirm.
  config = [{
    model_server = "Jetstream"
    model_server_configs = [{
      # Jetstream entry lists only the Gemma models.
      models = [
        "gemma2-2b",
        "gemma2-9b",
        "gemma2-27b"
      ]
      model_configs = []
    }]
  }, {
    model_server = "vllm"
    model_server_configs = [{
      models = [
        "gemma2-2b",
        "gemma2-9b",
        "gemma2-27b",
        "llama3-8b",
        "llama3-70b",
        "llama3-405b"
      ]
      model_configs = []
    }]
  }, {
    model_server = "tgi"
    model_server_configs = [{
      models = [
        "gemma2-2b",
        "gemma2-9b",
        "gemma2-27b",
        "llama3-8b",
        "llama3-70b",
        "llama3-405b"
      ]
      model_configs = []
    }]
  }, {
    model_server = "tensorrt-llm"
    model_server_configs = [{
      # TensorRT-LLM entry lists only the Llama models.
      models = [
        "llama3-8b",
        "llama3-70b",
        "llama3-405b"
      ]
      model_configs = []
    }]
  }]
}
Loading

0 comments on commit 91fe581

Please sign in to comment.