
Commit

intermediate changes
Bslabe123 committed Aug 20, 2024
1 parent f4e76b2 commit ccc512f
Showing 7 changed files with 81 additions and 70 deletions.
15 changes: 15 additions & 0 deletions benchmarks/benchmark/tools/latency-profile/main.tf
@@ -31,6 +31,7 @@ locals {

  all_manifests = flatten([for manifest_file in local.templates :
    [for data in split("---", templatefile(manifest_file, {
      combo = format("%s-%s-%s-%s", var.inference_server.name, var.inference_server.model, var.inference_server.accelerator_config.type, var.inference_server.accelerator_config.count)
      artifact_registry = var.artifact_registry
      namespace = var.namespace
      inference_server_framework = var.inference_server.name
@@ -50,6 +51,20 @@ locals {
  ])
}

terraform {
  required_providers {
    kubernetes = {
      source = "hashicorp/kubernetes"
      version = ">= 2.0"
    }
  }
}

data "google_client_config" "identity" {
  count = var.credentials_config.fleet_host != null ? 1 : 0
}


resource "google_project_service" "cloudbuild" {
  count = var.build_latency_profile_generator_image ? 1 : 0
  project = var.project_id
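
For reference, the new combo local simply concatenates the server name, model, accelerator type, and accelerator count into one identifier. A minimal sketch of the rendered value, using assumed example inputs (jetstream, gemma-7b, tpu-v5-lite-podslice, 1) rather than values taken from this commit:

  # Sketch only; the input values below are assumptions for illustration.
  locals {
    combo = format("%s-%s-%s-%s", "jetstream", "gemma-7b", "tpu-v5-lite-podslice", 1)
    # => "jetstream-gemma-7b-tpu-v5-lite-podslice-1"
  }

The latency-profile-generator Job manifest below reuses this value as lpg-${combo}, so each generated profiling Job gets a unique, self-describing name.
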
@@ -1,7 +1,7 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: latency-profile-generator
  name: lpg-${combo}
  namespace: ${namespace}
  labels:
    name: latency-profile-generator
36 changes: 0 additions & 36 deletions benchmarks/benchmark/tools/latency-profile/providers.tf

This file was deleted.

5 changes: 5 additions & 0 deletions benchmarks/benchmark/tools/latency-profile/variables.tf
@@ -76,11 +76,16 @@ variable "inference_server" {
  type = object({
    deploy = optional(bool), # Do you want this module to deploy the model server?
    name = string,
    model = string,
    tokenizer = string,
    service = object({
      name = string,
      port = number,
    })
    accelerator_config = object({
      type = string,
      count = number,
    })
  })
  nullable = false

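
A value for the extended variable might look like the following sketch; the specific model, tokenizer, service, and accelerator values here are assumptions for illustration, not requirements of the module:

  # Sketch only; values are placeholders.
  inference_server = {
    deploy    = true
    name      = "jetstream"
    model     = "gemma-7b"            # new field
    tokenizer = "google/gemma-7b"
    service = {
      name = "maxengine-server"
      port = 8000
    }
    accelerator_config = {            # new field
      type  = "tpu-v5-lite-podslice"
      count = 1
    }
  }
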
52 changes: 51 additions & 1 deletion benchmarks/benchmark/tools/profile-generator/main.tf
@@ -14,6 +14,27 @@
* limitations under the License.
*/

provider "kubernetes" {
config_path = (
var.credentials_config.kubeconfig == null
? null
: pathexpand(var.credentials_config.kubeconfig.path)
)
config_context = try(
var.credentials_config.kubeconfig.context, null
)
host = (
var.credentials_config.fleet_host == null
? null
: var.credentials_config.fleet_host
)
token = try(data.google_client_config.identity.0.access_token, null)
}

data "google_client_config" "identity" {
count = var.credentials_config.fleet_host != null ? 1 : 0
}



resource "google_project_service" "cloudbuild" {
@@ -32,6 +53,22 @@ resource "google_project_service" "cloudbuild" {
# CREATE NODEPOOLS

module "latency-profile" {
  for_each = toset(
    flatten([
      for config in toset(var.profiles.config): toset([
        for model_server_config in toset(config.model_server_configs): toset([
          for model in toset(model_server_config.models): toset([
            for model_config in toset(model_server_config.model_configs): toset([
              for accelerator in toset(model_config.accelerators): toset([
                for accelerator_config in toset(model_config.accelerator_configs):
                join(" ", [model, config.model_server, accelerator, accelerator_config.accelerator_count])
              ])
            ])
          ])
        ])
      ])
    ])
  )
  source = "../latency-profile"

  credentials_config = var.credentials_config
@@ -41,7 +78,20 @@ module "latency-profile" {
  templates_path = var.templates_path
  artifact_registry = var.artifact_registry
  build_latency_profile_generator_image = false # Don't build the image for each profile-generator instance; it only needs to be built once.
  inference_server = var.inference_server
  inference_server = {
    deploy = true
    name = split(" ", each.value)[1]
    model = split(" ", each.value)[0]
    tokenizer = "google/gemma-7b"
    service = {
      name = "maxengine-server", # inference server service name
      port = 8000
    }
    accelerator_config = {
      type = split(" ", each.value)[2]
      count = split(" ", each.value)[3]
    }
  }
  max_num_prompts = var.max_num_prompts
  max_output_len = var.max_output_len
  max_prompt_len = var.max_prompt_len
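
The nested flatten above emits one space-delimited key per (model, model server, accelerator, accelerator count) combination, and each latency-profile instance recovers the individual fields with split. A sketch of the key shape, with assumed example values:

  # each.value has the shape "<model> <model_server> <accelerator> <accelerator_count>",
  # e.g. "gemma2-2b jetstream tpu-v5-lite-podslice 1", so:
  #   split(" ", each.value)[0]  => "gemma2-2b"             (model)
  #   split(" ", each.value)[1]  => "jetstream"             (model server name)
  #   split(" ", each.value)[2]  => "tpu-v5-lite-podslice"  (accelerator type)
  #   split(" ", each.value)[3]  => "1"                     (accelerator count)

Note that split returns strings, so the count arrives as "1"; Terraform's implicit type conversion turns it back into a number where the latency-profile module expects one.
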
23 changes: 9 additions & 14 deletions benchmarks/benchmark/tools/profile-generator/sample.tfvars
@@ -22,25 +22,13 @@ credentials_config = {

project_id = "tpu-vm-gke-testing"


# Latency profile generator service configuration
artifact_registry = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark"
build_latency_profile_generator_image = false
latency_profile_kubernetes_service_account = "prom-frontend-sa"
output_bucket = "tpu-vm-gke-testing-benchmark-output-bucket"
k8s_hf_secret = "hf-token"

# Inference server configuration
inference_server = {
  deploy = false
  name = "jetstream"
  tokenizer = "google/gemma-7b"
  service = {
    name = "maxengine-server", # inference server service name
    port = 8000
  }
}

# Benchmark configuration for Locust Docker accessing inference server
request_rates = [5, 10, 15, 20]

@@ -64,14 +52,21 @@ profiles = {
  request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

  config = [{
    model_server = "Jetstream"
    model_server = "jetstream"
    model_server_configs = [{
      models = [
        "gemma2-2b",
        "gemma2-9b",
        "gemma2-27b"
      ]
      model_configs = []
      model_configs = [{
        accelerators = [
          "tpu-v5-lite-podslice",
        ]
        accelerator_configs = [{
          accelerator_count = 1
        }]
      }]
    }]
  }, {
    model_server = "vllm"
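
With the jetstream entry above, the for_each in profile-generator/main.tf would create three latency-profile instances, keyed roughly as follows (the vllm entry is truncated here, so its keys are omitted):

  # "gemma2-2b jetstream tpu-v5-lite-podslice 1"
  # "gemma2-9b jetstream tpu-v5-lite-podslice 1"
  # "gemma2-27b jetstream tpu-v5-lite-podslice 1"
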
18 changes: 0 additions & 18 deletions benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -71,24 +71,6 @@ variable "build_latency_profile_generator_image" {
  default = true
}

variable "inference_server" {
type = object({
deploy = optional(bool), # Do you want this module to deploy the model server?
name = string,
tokenizer = string, # Benchmark server configuration for tokenizer
service = object({
name = string,
port = number,
})
})
nullable = false

validation {
condition = var.inference_server.name == "vllm" || var.inference_server.name == "tgi" || var.inference_server.name == "tensorrt_llm_triton" || var.inference_server.name == "sax" || var.inference_server.name == "jetstream"
error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream."
}
}

variable "max_num_prompts" {
  description = "Benchmark server configuration for max number of prompts."
  type = number
Expand Down
