From a6677e5315efea95b748eb62bd6e75657bc4e132 Mon Sep 17 00:00:00 2001 From: Ashok Chandrasekar Date: Mon, 12 Aug 2024 06:46:34 +0000 Subject: [PATCH 01/37] Add latency profile generator This change adds a new benchmarking suite called latency-profile-generator which runs serving benchmarks at different request rates to produce latency and throughput numbers at different QPS. This can be used to identify how different models and model servers perform depending on incoming traffic. --- .../benchmark/tools/latency-profile/README.md | 209 ++++++++ .../benchmark/tools/latency-profile/build.tf | 8 + .../latency-profile/container/Dockerfile | 22 + .../container/benchmark_serving.py | 460 ++++++++++++++++++ .../container/latency_throughput_curve.sh | 30 ++ .../container/requirements.txt | 37 ++ .../benchmark/tools/latency-profile/main.tf | 71 +++ .../latency-profile-generator.yaml.tpl | 43 ++ .../tools/latency-profile/providers.tf | 36 ++ .../tools/latency-profile/sample.tfvars | 21 + .../tools/latency-profile/variables.tf | 153 ++++++ 11 files changed, 1090 insertions(+) create mode 100644 benchmarks/benchmark/tools/latency-profile/README.md create mode 100644 benchmarks/benchmark/tools/latency-profile/build.tf create mode 100644 benchmarks/benchmark/tools/latency-profile/container/Dockerfile create mode 100644 benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py create mode 100755 benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh create mode 100644 benchmarks/benchmark/tools/latency-profile/container/requirements.txt create mode 100644 benchmarks/benchmark/tools/latency-profile/main.tf create mode 100644 benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl create mode 100644 benchmarks/benchmark/tools/latency-profile/providers.tf create mode 100644 benchmarks/benchmark/tools/latency-profile/sample.tfvars create mode 100644 benchmarks/benchmark/tools/latency-profile/variables.tf diff --git a/benchmarks/benchmark/tools/latency-profile/README.md b/benchmarks/benchmark/tools/latency-profile/README.md new file mode 100644 index 000000000..cdaaa31aa --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/README.md @@ -0,0 +1,209 @@ +# AI on GKE Benchmark Locust + + +- [AI on GKE Benchmark Locust](#ai-on-gke-benchmark-locust) + - [Overview](#overview) + - [Instructions](#instructions) + - [Step 1: prepare benchmark prompts](#step-1-prepare-benchmark-prompts) + - [Step 2: create and give service account access to view dataset](#step-2-create-and-give-service-account-access-to-view-dataset) + - [Step 3: create output bucket](#step-3-create-output-bucket) + - [Step 4: create and give service account access to write to output gcs bucket](#step-4-create-and-give-service-account-access-to-write-to-output-gcs-bucket) + - [Step 5: create artifact repository for automated Locust docker build](#step-5-create-artifact-repository-for-automated-locust-docker-build) + - [Step 6: create and configure terraform.tfvars](#step-6-create-and-configure-terraformtfvars) + - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig) + - [\[optional\] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager) + - [Step 7: login to gcloud](#step-7-login-to-gcloud) + - [Step 8: terraform initialize, plan and apply](#step-8-terraform-initialize-plan-and-apply) + - [Step 9: start an end to end benchmark](#step-9-start-an-end-to-end-benchmark) + - 
[option 1: initiate a single end to end Locust benchmark run via curl command](#option-1-initiate-a-single-end-to-end-locust-benchmark-run-via-curl-command) + - [option 2: interactive benchmark with locust web ui](#option-2-interactive-benchmark-with-locust-web-ui) + - [Step 10: viewing metrics](#step-10-viewing-metrics) + - [Additional Tips](#additional-tips) + - [Variables](#variables) + + +## Overview + +This deploys the latency profile generator which measures the throuhghput and +latency at various request rates for the model and model server of your choice. + +It currently supports the following frameworks: +- tensorrt_llm_triton +- text generation inference (tgi) +- vllm +- sax + +## Instructions + +### Step 1: create output bucket + +If you followed steps from `../../infra/` for creating your cluster and extra +resources, you will already have an output bucket created for you. +If not, you will have to create and manage your own gcs bucket for storing +benchmarking results. + +Set the `output_bucket` in your `terraform.tfvars` to this gcs bucket. + +### Step 2: create and give service account access to write to output gcs bucket + +The Latency profile generator requires storage.admin access to write output to +the given output gcs bucket. If you followed steps in `../../infra`, then you +already have a kubernetes and gcloud service account created that has the proper +access to the created output bucket. + +To give viewer permissions on the gcs bucket to the gcloud service account, +run the following: + +``` +gcloud storage buckets add-iam-policy-binding gs://$OUTPUT_BUCKET/ +--member=serviceAccount:$GOOGLE_SERVICE_ACCOUNT@$PROJECT_ID.iam.gserviceaccount.com --role=roles/storage.admin +``` + +Your kubernetes service account will inherit the reader permissions. + +You will set the `lantency_profile_kubernetes_service_account` in your +`terraform.tfvars` to the kubernetes service account name. + +### Step 5: create artifact repository for automated Locust docker build + +The latency profile generator rebuilds the docker file on each terraform apply. +The containers will be pushed to the given `artifact_registry`. This artifact +repository is expected to already exist. If you created your cluster via +`../../infra/`, then an artifact repository was created for you with the same +name as the prefix in the same location as the cluster. You can also create your +own via this command: + +```bash +gcloud artifacts repositories create ai-benchmark --location=us-central1 --repository-format=docker +``` + + +### Step 6: create and configure terraform.tfvars + +Create a `terraform.tfvars` file. `./sample-tfvars/tgi-sample.tfvars` is +provided as an example file. You can copy the file as a starting point. +Note that at a minimum you will have to change the existing +`credentials_config`, `project_id`, and `artifact_registry`. + +```bash +cp ./sample-tfvars/tgi-sample.tfvars terraform.tfvars +``` + +Fill out your `terraform.tfvars` with the desired model and server configuration, referring to the list of required and optional variables [here](#variables). 
The following variables are required: +- `credentials_config` - credentials for cluster to deploy Locust benchmark tool on +- `project_id` - project id for enabling dependent services for building locust artifacts +- `artifact_registry` - artifact registry to upload locust artifacts to +- `inference_server_service` - an accessible service name for inference workload to be benchmarked **(Note: If you are using a non-80 port for your model server service, it should be specified here. Example: `my-service-name:9000`)** +- `tokenizer` - must match the model running on the inference workload to be benchmarked +- `inference_server_framework` - the inference workload framework +- `output_bucket` - gcs bucket to write benchmarking metrics to. +- `latency_profile_kubernetes_service_account` - service account giving access to latency profile generator to write to `output_bucket` + +#### [optional] set-up credentials config with kubeconfig + +If your cluster has fleet management enabled, the existing `credentials_config` +can use the fleet host credentials like this: + +```bash +credentials_config = { + fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/$CLUSTER_NAME" +} +``` + +If your cluster does not have fleet management enabled, you can use your +cluster's kubeconfig in the `credentials_config`. You must isolate your +cluster's kubeconfig from other clusters in the default kube.config file. +To do this, run the following command: + +```bash +KUBECONFIG=~/.kube/${CLUSTER_NAME}-kube.config gcloud container clusters get-credentials $CLUSTER_NAME --location $CLUSTER_LOCATION +``` + +Then update your `terraform.tfvars` `credentials_config` to the following: + +```bash +credentials_config = { + kubeconfig = { + path = "~/.kube/${CLUSTER_NAME}-kube.config" + } +} +``` + +#### [optional] set up secret token in Secret Manager + +A model may require a security token to access it. For example, Llama2 from +HuggingFace is a gated model that requires a +[user access token](https://huggingface.co/docs/hub/en/security-tokens). If the +model you want to run does not require this, skip this step. + +If you followed steps from `.../../infra/`, Secret Manager and the user access +token should already be set up. If not, it is strongly recommended that you use +Workload Identity and Secret Manager to access the user access tokens to avoid +adding a plain text token into the terraform state. To do so, follow the +instructions for +[setting up a secret in Secret Manager here](https://cloud.google.com/kubernetes-engine/docs/tutorials/workload-identity-secrets). + +Once complete, you should add these related secret values to your +`terraform.tfvars`: + +```bash +# ex. "projects/sample-project/secrets/hugging_face_secret" +hugging_face_secret = $SECRET_ID + # ex. 1 +hugging_face_secret_version = $SECRET_VERSION +``` + +### Step 7: login to gcloud + +Run the following gcloud command for authorization: + +```bash +gcloud auth application-default login +``` + +### Step 8: terraform initialize, plan and apply + +Run the following terraform commands: + +```bash +# initialize terraform +terraform init + +# verify changes +terraform plan + +# apply changes +terraform apply +``` + +A results file will appear in GCS bucket specified as `output_bucket` in input +variables. 
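
Once the latency profile generator pod finishes a run, the results file can be
copied down from the bucket for inspection. A minimal sketch, assuming the
default file naming used by `latency_throughput_curve.sh`
(`latency-profile-<timestamp>.txt`):

```bash
# List result files written by the latency profile generator
gsutil ls gs://$OUTPUT_BUCKET/

# Copy a specific run's results locally
gsutil cp gs://$OUTPUT_BUCKET/latency-profile-<timestamp>.txt .
```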
+ + + +## Variables + +| Name | Description | Type | Default | Required | +| -------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | :------: | +| [artifact\_registry](#input\_artifact\_registry) | Artifact registry for storing Locust container | `string` | `null` | yes | +| [best\_of](#input\_best\_of) | Benchmark server configuration for best of. | `number` | `1` | no | +| [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. |
object({
fleet_host = optional(string)
kubeconfig = optional(object({
context = optional(string)
path = optional(string, "~/.kube/config")
}))
})
| n/a | yes | +| [gcs\_path](#input\_gcs\_path) | Benchmark server configuration for gcs\_path for downloading prompts. | `string` | n/a | yes | +| [inference\_server\_framework](#input\_inference\_server\_framework) | Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt\_llm\_triton, sax, or jetstream | `string` | `"tgi"` | yes | +| [request\_type](#input\_request\_type) | Protocol to use when making requests to the model server. Can be `grpc` or `http` | `string` | `"http"` | no | +| [inference\_server\_ip](#input\_inference\_server\_ip) | Inference server ip address | `string` | n/a | yes | +| [ksa](#input\_ksa) | Kubernetes Service Account used for workload. | `string` | `"default"` | no | +| [locust\_runner\_kubernetes\_service\_account](#locust\_runner\_kubernetes\_service\_account) | "Kubernetes Service Account to be used for Locust runner tool. Must have storage.admin access to output_bucket" | `string` | `"sample-runner-ksa"` | no | +| [output\_bucket](#output\_bucket) | "Bucket name for storing results" | `string` | n/a | yes | +| [max\_num\_prompts](#input\_max\_num\_prompts) | Benchmark server configuration for max number of prompts. | `number` | `1000` | no | +| [max\_output\_len](#input\_max\_output\_len) | Benchmark server configuration for max output length. | `number` | `1024` | no | +| [max\_prompt\_len](#input\_max\_prompt\_len) | Benchmark server configuration for max prompt length. | `number` | `1024` | no | +| [num\_locust\_workers](#input\num\_locust\_workers) | Number of locust worker pods to deploy. | `number` | `1` | no | +| [namespace](#input\_namespace) | Namespace used for model and benchmarking deployments. | `string` | `"default"` | no | +| [project\_id](#input\_project\_id) | Project id of existing or created project. | `string` | n/a | yes | +| [sax\_model](#input\_sax\_model) | Benchmark server configuration for sax model. Only required if framework is sax. | `string` | `""` | no | +| [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. | `string` | `"tiiuae/falcon-7b"` | yes | +| [use\_beam\_search](#input\_use\_beam\_search) | Benchmark server configuration for use beam search. | `bool` | `false` | no | + [huggingface_secret](#input\_huggingface_secret) | Name of the secret holding the huggingface token. Stored in GCP Secrets Manager. | `string` | `huggingface-secret` | no | + [k8s_hf_secret](#input\_huggingface_secret) | Name of the secret holding the huggingface token. Stored in K8s. Key is expected to be named: `HF_TOKEN`. See [here](https://kubernetes.io/docs/tasks/configmap-secret/managing-secret-using-kubectl/#use-raw-data) for more. 
| `string` | `huggingface-secret` | no | + diff --git a/benchmarks/benchmark/tools/latency-profile/build.tf b/benchmarks/benchmark/tools/latency-profile/build.tf new file mode 100644 index 000000000..784cf09f7 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/build.tf @@ -0,0 +1,8 @@ +resource "null_resource" "build_and_push_image" { + + depends_on = [resource.google_project_service.cloudbuild] + provisioner "local-exec" { + working_dir = path.module + command = "gcloud builds submit --tag ${var.artifact_registry}/latency-profile:latest container" + } +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/container/Dockerfile b/benchmarks/benchmark/tools/latency-profile/container/Dockerfile new file mode 100644 index 000000000..4c656a0b7 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/container/Dockerfile @@ -0,0 +1,22 @@ +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev + +RUN apt-get update -y \ + && apt-get install -y python3-pip git vim curl wget +RUN pip3 install --upgrade pip +RUN pip install packaging torch transformers +WORKDIR /workspace + +# install build and runtime dependencies +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN pip install -U "huggingface_hub[cli]" +RUN pip install gsutil + +RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +COPY benchmark_serving.py benchmark_serving.py +COPY latency_throughput_curve.sh latency_throughput_curve.sh + +RUN chmod +x latency_throughput_curve.sh +RUN chmod +x benchmark_serving.py diff --git a/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py b/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py new file mode 100644 index 000000000..19332dbf9 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py @@ -0,0 +1,460 @@ +r"""Benchmark LLM serving throughput and latency. + +This script is for sending requests with prompts to LLM server and benchmark +the latency and throughput at various request rates. It is a modified version of +https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py. +It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. +""" + +import argparse +import asyncio +import json +import random +import time +from typing import AsyncGenerator, List, Tuple + +import aiohttp +import numpy as np +from transformers import AutoTokenizer +from transformers import PreTrainedTokenizerBase + + +# (prompt len, output len, latency) +REQUEST_LATENCY: List[Tuple[int, int, float]] = [] + +MIN_SEQ_LEN = 4 +CLIENT_TIMEOUT_SEC = 3 * 60 * 60 +NEW_TEXT_KEY = "\nOutput:\n" + + +def sample_requests( + dataset_path: str, + num_requests: int, + max_input_len: int, + max_output_len: int, + tokenizer: PreTrainedTokenizerBase, + use_dummy_text: bool, +) -> List[Tuple[str, int, int]]: + """Samples requests from the dataset or creates dummy requests.""" + if use_dummy_text: + dummy_prompt_token_ids = [0] * max_input_len + dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) + dummy_requests = [( + dummy_prompt, + max_input_len, + max_output_len, + )] * num_requests + return dummy_requests + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. 
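  # The first turn is used as the prompt; the token length of the second turn
  # (the reference completion) becomes the requested output length.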
+ dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] + + # Tokenize the prompts and completions. + prompts = [prompt for prompt, _ in dataset] + prompt_token_ids = tokenizer(prompts).input_ids + completions = [completion for _, completion in dataset] + completion_token_ids = tokenizer(completions).input_ids + tokenized_dataset = [] + for i in range(len(dataset)): + output_len = len(completion_token_ids[i]) + tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + + # Filter out too long sequences. + filtered_dataset: List[Tuple[str, int, int]] = [] + for prompt, prompt_token_ids, output_len in tokenized_dataset: + prompt_len = len(prompt_token_ids) + if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: + # Prune too short sequences. + # This is because TGI causes errors when the input or output length + # is too short. + continue + if prompt_len > max_input_len or output_len > max_output_len: + # Prune too long sequences. + continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + # Sample the requests. + sampled_requests = random.sample(filtered_dataset, num_requests) + return sampled_requests + + +async def get_request( + input_requests: List[Tuple[str, int, int]], + request_rate: float, +) -> AsyncGenerator[Tuple[str, int, int], None]: + """Gets request async.""" + input_requests = iter(input_requests) + for request in input_requests: + yield request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + # Sample the request interval from the exponential distribution. + interval = np.random.exponential(1.0 / request_rate) + # The next request will be sent after the interval. + await asyncio.sleep(interval) + + +async def send_request( + backend: str, + api_url: str, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, +) -> None: + """Sends request to server.""" + request_start_time = time.time() + + headers = {"User-Agent": "Benchmark Client"} + if backend == "vllm": + pload = { + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "max_tokens": output_len, + "ignore_eos": False, + "stream": False, + } + elif backend == "tgi": + assert not use_beam_search + params = { + "best_of": best_of, + "max_new_tokens": output_len, + "do_sample": True, + } + pload = { + "inputs": prompt, + "parameters": params, + } + elif backend == "naive_transformers": + # If max_length or top_k is not specified _MAX_LENGTH_DEFAULT = 200 and + # _TOP_K_DEFAULT = 10 in peft/handler.py will be used. 
+ pload = { + "instances": [{ + "prompt": prompt, + "max_length": output_len, + "top_k": top_k, + }] + } + elif backend == "tensorrt_llm_triton": + pload = { + "text_input": prompt, + "max_tokens": output_len, + "beam_width": 1 if not use_beam_search else best_of, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "bad_words": "", + "stop_words": "", + "stream": False, + } + elif backend == "sax": + pload = { + "model": sax_model, + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "top_k": 50, + "max_tokens": output_len, + "stream": False, + } + else: + raise ValueError(f"Unknown backend: {backend}") + + # Set client timeout to be 3 hrs. + timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) + async with aiohttp.ClientSession(timeout=timeout) as session: + while True: + async with session.post(api_url, headers=headers, json=pload) as response: + chunks = [] + async for chunk, _ in response.content.iter_chunks(): + chunks.append(chunk) + output = b"".join(chunks).decode("utf-8") + output = json.loads(output) + + # Re-send the request if it failed. + if "error" not in output: + break + + request_end_time = time.time() + # Naive HF transformers generation and TensorRT-LLM generation stops at EOS + # tokens and the generation may be shorter than the ground-truth output + # sequence length. + if backend == "naive_transformers": + complete_pred = output["predictions"][0][0]["generated_text"] + new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) + pred = complete_pred[new_text_start_index:] + output_token_ids = tokenizer(pred).input_ids + output_len = len(output_token_ids) - prompt_len + elif backend == "tensorrt_llm_triton": + output_token_ids = tokenizer(output["text_output"]).input_ids + output_len = len(output_token_ids) + elif backend == "sax": + output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids + output_len = len(output_token_ids) + elif backend == "tgi": + output_token_ids = tokenizer(output["generated_text"]).input_ids + output_len = len(output_token_ids) + elif backend == "vllm": + total_token_ids = tokenizer(output["text"][0]).input_ids + new_total_len = len(total_token_ids) + output_len = new_total_len - prompt_len + + request_latency = request_end_time - request_start_time + REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) + + +async def benchmark( + backend: str, + api_url: str, + input_requests: List[Tuple[str, int, int]], + best_of: int, + use_beam_search: bool, + request_rate: float, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, +) -> None: + """Runs benchmark with asynchronous requests.""" + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, request_rate): + prompt, prompt_len, output_len = request + task = asyncio.create_task( + send_request( + backend, + api_url, + prompt, + prompt_len, + output_len, + best_of, + use_beam_search, + top_k, + tokenizer, + sax_model, + ) + ) + tasks.append(task) + await asyncio.gather(*tasks) + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + api_url = f"http://{args.host}:{args.port}/{args.endpoint}" + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + input_requests = sample_requests( + args.dataset, + args.num_prompts, + args.max_input_length, + args.max_output_length, + tokenizer, + 
args.use_dummy_text, + ) + + benchmark_start_time = time.time() + asyncio.run( + benchmark( + args.backend, + api_url, + input_requests, + args.best_of, + args.use_beam_search, + args.request_rate, + args.top_k, + tokenizer, + args.sax_model, + ) + ) + benchmark_end_time = time.time() + benchmark_time = benchmark_end_time - benchmark_start_time + print(f"Total time: {benchmark_time:.2f} s") + print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}") + + total_output_tokens = np.sum([output_len for _, output_len, _ in + REQUEST_LATENCY]) + output_tokens_per_min = 60 * total_output_tokens / benchmark_time + print(f"Output_tokens/min: {output_tokens_per_min:.2f}") + + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in + REQUEST_LATENCY]) + input_tokens_per_min = 60 * total_input_tokens / benchmark_time + print(f"Input_tokens/min: {input_tokens_per_min:.2f}") + + total_tokens = total_input_tokens + total_output_tokens + tokens_per_min = 60 * total_tokens / benchmark_time + print(f"Tokens/min: {tokens_per_min:.2f}") + + if args.machine_cost: + print( + "Cost $/1k tokens:" + f" {args.machine_cost * 1000 / (60 * output_tokens_per_min)}" + ) + # NOTE: The latency below includes requests awaiting time on server side. + # It's not comparable with the model inference latency for batch size 1. + avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY]) + print( + "Average seconds/request (includes waiting time on server):" + f" {avg_latency:.2f}" + ) + + avg_per_token_latency = np.mean([ + latency / (prompt_len + output_len) + for prompt_len, output_len, latency in REQUEST_LATENCY + ]) + print( + "Average milliseconds/token (includes waiting time on server):" + f" {1000 * avg_per_token_latency:.2f}" + ) + + avg_per_output_token_latency = np.mean( + [latency / output_len for _, output_len, latency in REQUEST_LATENCY] + ) + print( + "Average milliseconds/output_token (includes waiting time on server):" + f" {1000 * avg_per_output_token_latency:.2f}" + ) + + avg_input_len = np.mean( + [prompt_len for prompt_len, _, _ in REQUEST_LATENCY] + ) + print( + "Average input length:" + f" {avg_input_len:.2f}" + ) + + avg_output_len = np.mean( + [output_len for _, output_len, _ in REQUEST_LATENCY] + ) + print( + "Average output length:" + f" {avg_output_len:.2f}" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark the online serving throughput." 
+ ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=[ + "vllm", + "tgi", + "naive_transformers", + "tensorrt_llm_triton", + "sax", + ], + ) + parser.add_argument( + "--sax_model", + type=str, + default="", + help="Model name to send request to at API server for SAX model server.", + ) + parser.add_argument("--endpoint", type=str, default="generate") + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=7080) + parser.add_argument("--dataset", type=str, help="Path to the dataset.") + parser.add_argument( + "--tokenizer", + type=str, + required=True, + help="Name or path of the tokenizer.", + ) + parser.add_argument( + "--best-of", + type=int, + default=1, + help="Generates `best_of` sequences per prompt and returns the best one.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--max-input-length", + type=int, + default=1024, + help=( + "Maximum number of input tokens for filtering the benchmark dataset." + ), + ) + parser.add_argument( + "--max-output-length", + type=int, + default=1024, + help=( + "Maximum number of input tokens for filtering the benchmark dataset." + ), + ) + parser.add_argument( + "--top-k", + type=int, + default=32000, + help=( + "Number of candidate tokens that are considered at each step of the" + " generation process. 32000 is the vocab_size of Open-LLaMA and" + " LLaMA2 models." + ), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help=( + "Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process to synthesize " + "the request arrival times." + ), + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="trust remote code from huggingface", + ) + parser.add_argument( + "--machine-cost", + type=float, + default=None, + help="Machine cost per hour including accelerators (if any)", + ) + parser.add_argument( + "--use-dummy-text", + action="store_true", + help=( + "Whether to use dummy text with length defined by max_input_length" + " and max_output_length." + ), + ) + cmd_args = parser.parse_args() + main(cmd_args) diff --git a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh new file mode 100755 index 000000000..e9c398aeb --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o xtrace + +export IP=$IP + +huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential + +timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +output_file="latency-profile-${timestamp}.txt" +for ((i = 1 ; i <= 256 ; i*=2 )); do + python3 benchmark_serving.py --host="$IP" --port=80 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$i --backend="$BACKEND" --num-prompts=256 --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file +done + +curl -H "Metadata-Flavor:Google" http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/email + +gsutil cp $output_file "gs://$OUTPUT_BUCKET/$output_file" diff --git a/benchmarks/benchmark/tools/latency-profile/container/requirements.txt b/benchmarks/benchmark/tools/latency-profile/container/requirements.txt new file mode 100644 index 000000000..4d1d37e18 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/container/requirements.txt @@ -0,0 +1,37 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +ruff==0.1.5 + +# type checking +mypy==0.991 +types-PyYAML +types-requests +types-setuptools + +# testing +pytest +pytest-forked +pytest-asyncio +httpx +einops # required for MPT +flash_attn # required for HuggingFace's llama implementation +openai +requests + +# run +ninja # For faster builds. +psutil +ray >= 2.9 +sentencepiece # Required for LLaMA tokenizer. +numpy +torch == 2.1.1 +transformers >= 4.37.0 # Required for Qwen2 +xformers == 0.0.23 +fastapi +uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. +aioprometheus[starlette] +pynvml == 11.5.0 +accelerate +aiohttp diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf new file mode 100644 index 000000000..24c91efcd --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -0,0 +1,71 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + templates = [ + for f in fileset(local.templates_path, "*tpl") : + "${local.templates_path}/${f}" + ] + templates_path = ( + var.templates_path == null + ? "${path.module}/manifest-templates" + : pathexpand(var.templates_path) + ) + hugging_face_token_secret = ( + var.hugging_face_secret == null || var.hugging_face_secret_version == null + ? 
null + : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}" + ) + + all_manifests = flatten([for manifest_file in local.templates : + [for data in split("---", templatefile(manifest_file, { + artifact_registry = var.artifact_registry + namespace = var.namespace + inference_server_service = var.inference_server_service + inference_server_framework = var.inference_server_framework + ksa = var.ksa + latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account + max_num_prompts = var.max_num_prompts + max_output_len = var.max_output_len + max_prompt_len = var.max_prompt_len + tokenizer = var.tokenizer + hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] + k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] + output_bucket = var.output_bucket + })) : data] + ]) +} + +resource "google_project_service" "cloudbuild" { + project = var.project_id + service = "cloudbuild.googleapis.com" + + timeouts { + create = "30m" + update = "40m" + } + + disable_on_destroy = false +} + +resource "kubernetes_manifest" "default" { + for_each = toset(local.all_manifests) + depends_on = [resource.null_resource.build_and_push_image] + manifest = yamldecode(each.value) + timeouts { + create = "30m" + } +} diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl new file mode 100644 index 000000000..858ac6df3 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -0,0 +1,43 @@ +apiVersion: "apps/v1" +kind: "Deployment" +metadata: + name: lantency-profile-generator + namespace: ${namespace} + labels: + name: lantency-profile-generator +spec: + replicas: 1 + selector: + matchLabels: + app: lantency-profile-generator + template: + metadata: + labels: + app: lantency-profile-generator + examples.ai.gke.io/source: ai-on-gke-benchmarks + spec: + serviceAccountName: ${latency_profile_kubernetes_service_account} + containers: + - name: lantency-profile-generator + image: ${artifact_registry}/latency-profile:latest + command: ["bash", "-c", "./latency_throughput_curve.sh"] + env: + - name: TOKENIZER + value: ${tokenizer} + - name: IP + value: ${inference_server_service} + - name: BACKEND + value: ${inference_server_framework} + - name: INPUT_LENGTH + value: ${max_prompt_len} + - name: OUTPUT_LENGTH + value: ${max_output_len} + - name: OUTPUT_BUCKET + value: ${output_bucket} +%{ for hugging_face_token_secret in hugging_face_token_secret_list ~} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: HF_TOKEN +%{ endfor ~} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/providers.tf b/benchmarks/benchmark/tools/latency-profile/providers.tf new file mode 100644 index 000000000..70c82e817 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/providers.tf @@ -0,0 +1,36 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +data "google_client_config" "identity" { + count = var.credentials_config.fleet_host != null ? 1 : 0 +} + +provider "kubernetes" { + config_path = ( + var.credentials_config.kubeconfig == null + ? null + : pathexpand(var.credentials_config.kubeconfig.path) + ) + config_context = try( + var.credentials_config.kubeconfig.context, null + ) + host = ( + var.credentials_config.fleet_host == null + ? null + : var.credentials_config.fleet_host + ) + token = try(data.google_client_config.identity.0.access_token, null) +} diff --git a/benchmarks/benchmark/tools/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/latency-profile/sample.tfvars new file mode 100644 index 000000000..44bd0c875 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/sample.tfvars @@ -0,0 +1,21 @@ +credentials_config = { + fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUM/locations/global/gkeMemberships/ai-benchmark" +} + +project_id = "$PROJECT_ID" + +namespace = "benchmark" +ksa = "benchmark-ksa" + +k8s_hf_secret = "hf-token" + +# Locust service configuration +artifact_registry = "us-central1-docker.pkg.dev/$PROJECT_ID/ai-benchmark" +inference_server_service = "tgi" # inference server service name +latency_profile_kubernetes_service_account = "sample-runner-ksa" +output_bucket = "${PROJECT_ID}-benchmark-output" +gcs_path = "gs://${PROJECT_ID}-ai-gke-benchmark-fuse/ShareGPT_V3_unfiltered_cleaned_split_filtered_prompts.txt" + +# Benchmark configuration for Locust Docker accessing inference server +inference_server_framework = "tgi" +tokenizer = "tiiuae/falcon-7b" \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf new file mode 100644 index 000000000..da93cf31a --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -0,0 +1,153 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "credentials_config" { + description = "Configure how Terraform authenticates to the cluster." + type = object({ + fleet_host = optional(string) + kubeconfig = optional(object({ + context = optional(string) + path = optional(string, "~/.kube/config") + })) + }) + nullable = false + validation { + condition = ( + (var.credentials_config.fleet_host != null) != + (var.credentials_config.kubeconfig != null) + ) + error_message = "Exactly one of fleet host or kubeconfig must be set." + } +} + +variable "namespace" { + description = "Namespace used for model and benchmarking deployments." 
+ type = string + nullable = false + default = "default" +} + +variable "project_id" { + description = "Project id of existing or created project." + type = string + nullable = false +} + +variable "ksa" { + description = "Kubernetes Service Account used for workload." + type = string + nullable = false + default = "default" +} + +variable "templates_path" { + description = "Path where manifest templates will be read from. Set to null to use the default manifests" + type = string + default = null +} + +variable "artifact_registry" { + description = "Artifact registry for storing Locust container." + type = string + default = null +} + +variable "inference_server_service" { + description = "Inference server service" + type = string + nullable = false +} + +variable "inference_server_framework" { + description = "Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt_llm_triton, sax" + type = string + nullable = false + default = "tgi" + validation { + condition = var.inference_server_framework == "vllm" || var.inference_server_framework == "tgi" || var.inference_server_framework == "tensorrt_llm_triton" || var.inference_server_framework == "sax" || var.inference_server_framework == "jetstream" + error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream." + } +} + +variable "max_num_prompts" { + description = "Benchmark server configuration for max number of prompts." + type = number + default = 1000 + validation { + condition = var.max_num_prompts > 0 + error_message = "The max_num_prompts value must be greater than 0." + } +} + +variable "max_output_len" { + description = "Benchmark server configuration for max output length." + type = number + default = 256 + validation { + condition = var.max_output_len > 4 + error_message = "The max_output_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "max_prompt_len" { + description = "Benchmark server configuration for max prompt length." + type = number + default = 256 + validation { + condition = var.max_prompt_len > 4 + error_message = "The max_prompt_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "tokenizer" { + description = "Benchmark server configuration for tokenizer." + type = string + nullable = false + default = "tiiuae/falcon-7b" +} + +variable "output_bucket" { + description = "Bucket name for storing results" + type = string +} + +variable "latency_profile_kubernetes_service_account" { + description = "Kubernetes Service Account to be used for the latency profile generator tool" + type = string + default = "sample-runner-ksa" +} + +// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644 +variable "k8s_hf_secret" { + description = "Name of secret for huggingface token; stored in k8s " + type = string + nullable = true + default = null +} + +variable "hugging_face_secret" { + description = "name of the kubectl huggingface secret token; stored in Secret Manager. 
Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" + type = string + nullable = true + default = null +} + +variable "hugging_face_secret_version" { + description = "Secret version in Secret Manager" + type = string + nullable = true + default = null +} From 022ccc8ed8e263e84204fb1c3546897e1be99345 Mon Sep 17 00:00:00 2001 From: Ashok Chandrasekar Date: Mon, 12 Aug 2024 18:48:23 +0000 Subject: [PATCH 02/37] Update readme and GCS push steps --- .../benchmark/tools/latency-profile/README.md | 100 ++++++++---------- .../container/latency_throughput_curve.sh | 7 +- .../tools/latency-profile/sample.tfvars | 2 +- .../tools/latency-profile/variables.tf | 2 +- 4 files changed, 51 insertions(+), 60 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/README.md b/benchmarks/benchmark/tools/latency-profile/README.md index cdaaa31aa..50dcba64e 100644 --- a/benchmarks/benchmark/tools/latency-profile/README.md +++ b/benchmarks/benchmark/tools/latency-profile/README.md @@ -1,26 +1,19 @@ -# AI on GKE Benchmark Locust - - -- [AI on GKE Benchmark Locust](#ai-on-gke-benchmark-locust) - - [Overview](#overview) - - [Instructions](#instructions) - - [Step 1: prepare benchmark prompts](#step-1-prepare-benchmark-prompts) - - [Step 2: create and give service account access to view dataset](#step-2-create-and-give-service-account-access-to-view-dataset) - - [Step 3: create output bucket](#step-3-create-output-bucket) - - [Step 4: create and give service account access to write to output gcs bucket](#step-4-create-and-give-service-account-access-to-write-to-output-gcs-bucket) - - [Step 5: create artifact repository for automated Locust docker build](#step-5-create-artifact-repository-for-automated-locust-docker-build) - - [Step 6: create and configure terraform.tfvars](#step-6-create-and-configure-terraformtfvars) - - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig) - - [\[optional\] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager) - - [Step 7: login to gcloud](#step-7-login-to-gcloud) - - [Step 8: terraform initialize, plan and apply](#step-8-terraform-initialize-plan-and-apply) - - [Step 9: start an end to end benchmark](#step-9-start-an-end-to-end-benchmark) - - [option 1: initiate a single end to end Locust benchmark run via curl command](#option-1-initiate-a-single-end-to-end-locust-benchmark-run-via-curl-command) - - [option 2: interactive benchmark with locust web ui](#option-2-interactive-benchmark-with-locust-web-ui) - - [Step 10: viewing metrics](#step-10-viewing-metrics) - - [Additional Tips](#additional-tips) - - [Variables](#variables) - +# AI on GKE Benchmark Latency Profile Generator + + +* [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator) + * [Overview](#overview) + * [Instructions](#instructions) + * [Step 1: create output bucket](#step-1--create-output-bucket) + * [Step 2: create and give service account access to write to output gcs bucket](#step-2--create-and-give-service-account-access-to-write-to-output-gcs-bucket) + * [Step 5: create artifact repository for automated Latency Profile Generator docker build](#step-5--create-artifact-repository-for-automated-latency-profile-generator-docker-build) + * [Step 6: create and configure terraform.tfvars](#step-6--create-and-configure-terraformtfvars) + * [[optional] set-up credentials config with 
kubeconfig](#optional-set-up-credentials-config-with-kubeconfig) + * [[optional] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager) + * [Step 7: login to gcloud](#step-7--login-to-gcloud) + * [Step 8: terraform initialize, plan and apply](#step-8--terraform-initialize-plan-and-apply) + * [Inputs](#inputs) + ## Overview @@ -64,7 +57,7 @@ Your kubernetes service account will inherit the reader permissions. You will set the `lantency_profile_kubernetes_service_account` in your `terraform.tfvars` to the kubernetes service account name. -### Step 5: create artifact repository for automated Locust docker build +### Step 5: create artifact repository for automated Latency Profile Generator docker build The latency profile generator rebuilds the docker file on each terraform apply. The containers will be pushed to the given `artifact_registry`. This artifact @@ -80,19 +73,19 @@ gcloud artifacts repositories create ai-benchmark --location=us-central1 --repos ### Step 6: create and configure terraform.tfvars -Create a `terraform.tfvars` file. `./sample-tfvars/tgi-sample.tfvars` is -provided as an example file. You can copy the file as a starting point. +Create a `terraform.tfvars` file. `./sample-tfvars` is provided as an example +file. You can copy the file as a starting point. Note that at a minimum you will have to change the existing `credentials_config`, `project_id`, and `artifact_registry`. ```bash -cp ./sample-tfvars/tgi-sample.tfvars terraform.tfvars +cp ./sample-tfvars terraform.tfvars ``` Fill out your `terraform.tfvars` with the desired model and server configuration, referring to the list of required and optional variables [here](#variables). The following variables are required: -- `credentials_config` - credentials for cluster to deploy Locust benchmark tool on -- `project_id` - project id for enabling dependent services for building locust artifacts -- `artifact_registry` - artifact registry to upload locust artifacts to +- `credentials_config` - credentials for cluster to deploy Latency Profile Generator benchmark tool on +- `project_id` - project id for enabling dependent services for building Latency Profile Generator artifacts +- `artifact_registry` - artifact registry to upload Latency Profile Generator artifacts to - `inference_server_service` - an accessible service name for inference workload to be benchmarked **(Note: If you are using a non-80 port for your model server service, it should be specified here. Example: `my-service-name:9000`)** - `tokenizer` - must match the model running on the inference workload to be benchmarked - `inference_server_framework` - the inference workload framework @@ -181,29 +174,26 @@ variables. -## Variables - -| Name | Description | Type | Default | Required | -| -------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------- | :------: | -| [artifact\_registry](#input\_artifact\_registry) | Artifact registry for storing Locust container | `string` | `null` | yes | -| [best\_of](#input\_best\_of) | Benchmark server configuration for best of. 
| `number` | `1` | no | -| [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. |
object({
fleet_host = optional(string)
kubeconfig = optional(object({
context = optional(string)
path = optional(string, "~/.kube/config")
}))
})
| n/a | yes | -| [gcs\_path](#input\_gcs\_path) | Benchmark server configuration for gcs\_path for downloading prompts. | `string` | n/a | yes | -| [inference\_server\_framework](#input\_inference\_server\_framework) | Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt\_llm\_triton, sax, or jetstream | `string` | `"tgi"` | yes | -| [request\_type](#input\_request\_type) | Protocol to use when making requests to the model server. Can be `grpc` or `http` | `string` | `"http"` | no | -| [inference\_server\_ip](#input\_inference\_server\_ip) | Inference server ip address | `string` | n/a | yes | -| [ksa](#input\_ksa) | Kubernetes Service Account used for workload. | `string` | `"default"` | no | -| [locust\_runner\_kubernetes\_service\_account](#locust\_runner\_kubernetes\_service\_account) | "Kubernetes Service Account to be used for Locust runner tool. Must have storage.admin access to output_bucket" | `string` | `"sample-runner-ksa"` | no | -| [output\_bucket](#output\_bucket) | "Bucket name for storing results" | `string` | n/a | yes | -| [max\_num\_prompts](#input\_max\_num\_prompts) | Benchmark server configuration for max number of prompts. | `number` | `1000` | no | -| [max\_output\_len](#input\_max\_output\_len) | Benchmark server configuration for max output length. | `number` | `1024` | no | -| [max\_prompt\_len](#input\_max\_prompt\_len) | Benchmark server configuration for max prompt length. | `number` | `1024` | no | -| [num\_locust\_workers](#input\num\_locust\_workers) | Number of locust worker pods to deploy. | `number` | `1` | no | -| [namespace](#input\_namespace) | Namespace used for model and benchmarking deployments. | `string` | `"default"` | no | -| [project\_id](#input\_project\_id) | Project id of existing or created project. | `string` | n/a | yes | -| [sax\_model](#input\_sax\_model) | Benchmark server configuration for sax model. Only required if framework is sax. | `string` | `""` | no | -| [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. | `string` | `"tiiuae/falcon-7b"` | yes | -| [use\_beam\_search](#input\_use\_beam\_search) | Benchmark server configuration for use beam search. | `bool` | `false` | no | - [huggingface_secret](#input\_huggingface_secret) | Name of the secret holding the huggingface token. Stored in GCP Secrets Manager. | `string` | `huggingface-secret` | no | - [k8s_hf_secret](#input\_huggingface_secret) | Name of the secret holding the huggingface token. Stored in K8s. Key is expected to be named: `HF_TOKEN`. See [here](https://kubernetes.io/docs/tasks/configmap-secret/managing-secret-using-kubectl/#use-raw-data) for more. | `string` | `huggingface-secret` | no | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [artifact\_registry](#input\_artifact\_registry) | Artifact registry for storing Latency Profile Generator container. | `string` | `null` | no | +| [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. |
object({
fleet_host = optional(string)
kubeconfig = optional(object({
context = optional(string)
path = optional(string, "~/.kube/config")
}))
})
| n/a | yes | +| [hugging\_face\_secret](#input\_hugging\_face\_secret) | name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/ | `string` | `null` | no | +| [hugging\_face\_secret\_version](#input\_hugging\_face\_secret\_version) | Secret version in Secret Manager | `string` | `null` | no | +| [inference\_server\_framework](#input\_inference\_server\_framework) | Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt\_llm\_triton, sax | `string` | `"tgi"` | no | +| [inference\_server\_service](#input\_inference\_server\_service) | Inference server service | `string` | n/a | yes | +| [k8s\_hf\_secret](#input\_k8s\_hf\_secret) | Name of secret for huggingface token; stored in k8s | `string` | `null` | no | +| [ksa](#input\_ksa) | Kubernetes Service Account used for workload. | `string` | `"default"` | no | +| [latency\_profile\_kubernetes\_service\_account](#input\_latency\_profile\_kubernetes\_service\_account) | Kubernetes Service Account to be used for the latency profile generator tool | `string` | `"sample-runner-ksa"` | no | +| [max\_num\_prompts](#input\_max\_num\_prompts) | Benchmark server configuration for max number of prompts. | `number` | `1000` | no | +| [max\_output\_len](#input\_max\_output\_len) | Benchmark server configuration for max output length. | `number` | `256` | no | +| [max\_prompt\_len](#input\_max\_prompt\_len) | Benchmark server configuration for max prompt length. | `number` | `256` | no | +| [namespace](#input\_namespace) | Namespace used for model and benchmarking deployments. | `string` | `"default"` | no | +| [output\_bucket](#input\_output\_bucket) | Bucket name for storing results | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | Project id of existing or created project. | `string` | n/a | yes | +| [templates\_path](#input\_templates\_path) | Path where manifest templates will be read from. Set to null to use the default manifests | `string` | `null` | no | +| [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. 
| `string` | `"tiiuae/falcon-7b"` | no | + diff --git a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh index e9c398aeb..6cae3199e 100755 --- a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh +++ b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh @@ -21,10 +21,11 @@ huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential timestamp=$(date +"%Y-%m-%d_%H-%M-%S") output_file="latency-profile-${timestamp}.txt" -for ((i = 1 ; i <= 256 ; i*=2 )); do - python3 benchmark_serving.py --host="$IP" --port=80 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$i --backend="$BACKEND" --num-prompts=256 --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file +for ((i = 1 ; i <= 2 ; i*=2 )); do + python3 benchmark_serving.py --host="$IP" --port=80 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$i --backend="$BACKEND" --num-prompts=2 --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file done -curl -H "Metadata-Flavor:Google" http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/email +TOKEN=$(curl -s -H "Metadata-Flavor: Google" http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/token | jq -r .access_token) +gsutil config -e -o Credentials:gs_oauth2_refresh_token=$TOKEN gsutil cp $output_file "gs://$OUTPUT_BUCKET/$output_file" diff --git a/benchmarks/benchmark/tools/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/latency-profile/sample.tfvars index 44bd0c875..d0cf45dfb 100644 --- a/benchmarks/benchmark/tools/latency-profile/sample.tfvars +++ b/benchmarks/benchmark/tools/latency-profile/sample.tfvars @@ -9,7 +9,7 @@ ksa = "benchmark-ksa" k8s_hf_secret = "hf-token" -# Locust service configuration +# Latency profile generator service configuration artifact_registry = "us-central1-docker.pkg.dev/$PROJECT_ID/ai-benchmark" inference_server_service = "tgi" # inference server service name latency_profile_kubernetes_service_account = "sample-runner-ksa" diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index da93cf31a..ba88bdeec 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -60,7 +60,7 @@ variable "templates_path" { } variable "artifact_registry" { - description = "Artifact registry for storing Locust container." + description = "Artifact registry for storing Latency Profile Generator container." 
type = string default = null } From d614688d9ced2ee9627cf52db6b9695895c740f6 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 13 Aug 2024 22:11:27 +0000 Subject: [PATCH 03/37] first commit --- benchmarks/benchmark/profiling/README.md | 8 +++ benchmarks/benchmark/profiling/manifest.json | 58 +++++++++++++++++++ .../container/benchmark_serving.py | 8 +++ .../benchmark/tools/latency-profile/main.tf | 7 +-- .../manifest-templates/hf-secret.yaml.tpl | 7 +++ .../latency-profile-generator.yaml.tpl | 4 +- .../manifest-templates/namespace.yaml.tpl | 4 ++ .../service-account.yaml.tpl | 5 ++ .../tools/latency-profile/variables.tf | 16 ++--- 9 files changed, 96 insertions(+), 21 deletions(-) create mode 100644 benchmarks/benchmark/profiling/README.md create mode 100644 benchmarks/benchmark/profiling/manifest.json create mode 100644 benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl create mode 100644 benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl create mode 100644 benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl diff --git a/benchmarks/benchmark/profiling/README.md b/benchmarks/benchmark/profiling/README.md new file mode 100644 index 000000000..2c3f6694d --- /dev/null +++ b/benchmarks/benchmark/profiling/README.md @@ -0,0 +1,8 @@ +This directory contains the configuration for running performance profiling across various model, model server, accelerator, and request rate combinations. The configuration file outlines the specific constraints and setups for benchmarking the performance of different models under various conditions. Below is an explanation of each field: + + - models: Which models will be benchmarked? + - accelerators: Which accelerators will be used for benchmarking? + - request_rates: For each model, model server, and accelerator combination, which request rates do we want to benchmark? + - model_servers: Enumerates the model servers to be benchmarked, along with their specific configurations: + - models: Indicates which models each server is capable of running, returns all accelerators that contain one or more of these strings. + - accelerators: Which accelerators does this model server support, returns all accelerators that contains one or more of these strings. 
\ No newline at end of file diff --git a/benchmarks/benchmark/profiling/manifest.json b/benchmarks/benchmark/profiling/manifest.json new file mode 100644 index 000000000..b9eb11d28 --- /dev/null +++ b/benchmarks/benchmark/profiling/manifest.json @@ -0,0 +1,58 @@ +{ + "models": [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b", + "llama3-8b", + "llama3-70b", + "llama3-405b" + ], + "accelerators" : [ + "tpu-v4-podslice", + "tpu-v5-lite-podslice", + "tpu-v5p-slice", + "nvidia-a100-80gb", + "nvidia-h100-80gb", + "nvidia-l4" + ], + "request_rates" : [1,2,3,4,6,8,12,16,24,32,48,64,96,128], + + "model_servers" : { + "Jetstream" : { + "models" : [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b" + ], + "accelerators": ["tpu"] + }, + "vllm" : { + "models" : [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b", + "llama3-8b", + "llama3-70b", + "llama3-405b" + ] + }, + "tgi": { + "models" : [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b", + "llama3-8b", + "llama3-70b", + "llama3-405b" + ] + }, + "tensorrt-llm": { + "models" : [ + "llama3-8b", + "llama3-70b", + "llama3-405b" + ], + "accelerators": ["nvidia"] + } + } +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py b/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py index 19332dbf9..4fd25db30 100644 --- a/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py @@ -177,6 +177,11 @@ async def send_request( "max_tokens": output_len, "stream": False, } + elif backend == "jetstream": + pload = { + "prompt": prompt, + "max_tokens": 1, + } else: raise ValueError(f"Unknown backend: {backend}") @@ -218,6 +223,9 @@ async def send_request( total_token_ids = tokenizer(output["text"][0]).input_ids new_total_len = len(total_token_ids) output_len = new_total_len - prompt_len + elif backend == "jetstream": + output_token_ids = tokenizer(output["response"]).input_ids + output_len = len(output_token_ids) request_latency = request_end_time - request_start_time REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 24c91efcd..5016fdb0e 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -24,11 +24,6 @@ locals { ? "${path.module}/manifest-templates" : pathexpand(var.templates_path) ) - hugging_face_token_secret = ( - var.hugging_face_secret == null || var.hugging_face_secret_version == null - ? null - : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}" - ) all_manifests = flatten([for manifest_file in local.templates : [for data in split("---", templatefile(manifest_file, { @@ -42,7 +37,7 @@ locals { max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len tokenizer = var.tokenizer - hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] + hugging_face_token_b64 = var.hugging_face_token_b64 k8s_hf_secret_list = var.k8s_hf_secret == null ? 
[] : [var.k8s_hf_secret] output_bucket = var.output_bucket })) : data] diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl new file mode 100644 index 000000000..d61629ee4 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: hf-token + namespace: ${namespace} +data: + HF_TOKEN: ${hugging_face_token_b64} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 858ac6df3..52b01a631 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -34,10 +34,8 @@ spec: value: ${max_output_len} - name: OUTPUT_BUCKET value: ${output_bucket} -%{ for hugging_face_token_secret in hugging_face_token_secret_list ~} - name: HF_TOKEN valueFrom: secretKeyRef: name: hf-token - key: HF_TOKEN -%{ endfor ~} \ No newline at end of file + key: HF_TOKEN \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl new file mode 100644 index 000000000..95e400737 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: ${namespace} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl new file mode 100644 index 000000000..da0da4267 --- /dev/null +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: ${latency_profile_kubernetes_service_account} + namespace: ${namespace} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index ba88bdeec..62915f15b 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -138,16 +138,8 @@ variable "k8s_hf_secret" { default = null } -variable "hugging_face_secret" { - description = "name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" +variable "hugging_face_token_b64" { + description = "Base 64 encoded hugging face token; stored in Secret Manager. 
Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" type = string - nullable = true - default = null -} - -variable "hugging_face_secret_version" { - description = "Secret version in Secret Manager" - type = string - nullable = true - default = null -} + nullable = false +} \ No newline at end of file From c7f36abb58efa4dfbbf9c0a2caa75ec8bbfb17c5 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 13 Aug 2024 22:12:56 +0000 Subject: [PATCH 04/37] remove profiling --- benchmarks/benchmark/profiling/README.md | 8 --- benchmarks/benchmark/profiling/manifest.json | 58 -------------------- 2 files changed, 66 deletions(-) delete mode 100644 benchmarks/benchmark/profiling/README.md delete mode 100644 benchmarks/benchmark/profiling/manifest.json diff --git a/benchmarks/benchmark/profiling/README.md b/benchmarks/benchmark/profiling/README.md deleted file mode 100644 index 2c3f6694d..000000000 --- a/benchmarks/benchmark/profiling/README.md +++ /dev/null @@ -1,8 +0,0 @@ -This directory contains the configuration for running performance profiling across various model, model server, accelerator, and request rate combinations. The configuration file outlines the specific constraints and setups for benchmarking the performance of different models under various conditions. Below is an explanation of each field: - - - models: Which models will be benchmarked? - - accelerators: Which accelerators will be used for benchmarking? - - request_rates: For each model, model server, and accelerator combination, which request rates do we want to benchmark? - - model_servers: Enumerates the model servers to be benchmarked, along with their specific configurations: - - models: Indicates which models each server is capable of running, returns all accelerators that contain one or more of these strings. - - accelerators: Which accelerators does this model server support, returns all accelerators that contains one or more of these strings. 
\ No newline at end of file diff --git a/benchmarks/benchmark/profiling/manifest.json b/benchmarks/benchmark/profiling/manifest.json deleted file mode 100644 index b9eb11d28..000000000 --- a/benchmarks/benchmark/profiling/manifest.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "models": [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b", - "llama3-8b", - "llama3-70b", - "llama3-405b" - ], - "accelerators" : [ - "tpu-v4-podslice", - "tpu-v5-lite-podslice", - "tpu-v5p-slice", - "nvidia-a100-80gb", - "nvidia-h100-80gb", - "nvidia-l4" - ], - "request_rates" : [1,2,3,4,6,8,12,16,24,32,48,64,96,128], - - "model_servers" : { - "Jetstream" : { - "models" : [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b" - ], - "accelerators": ["tpu"] - }, - "vllm" : { - "models" : [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b", - "llama3-8b", - "llama3-70b", - "llama3-405b" - ] - }, - "tgi": { - "models" : [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b", - "llama3-8b", - "llama3-70b", - "llama3-405b" - ] - }, - "tensorrt-llm": { - "models" : [ - "llama3-8b", - "llama3-70b", - "llama3-405b" - ], - "accelerators": ["nvidia"] - } - } -} \ No newline at end of file From d5d983e6c9ed75ce675bfca7cb681ae6ad253957 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 13 Aug 2024 22:15:57 +0000 Subject: [PATCH 05/37] correct steps --- benchmarks/benchmark/tools/latency-profile/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/README.md b/benchmarks/benchmark/tools/latency-profile/README.md index 50dcba64e..21a97234e 100644 --- a/benchmarks/benchmark/tools/latency-profile/README.md +++ b/benchmarks/benchmark/tools/latency-profile/README.md @@ -57,7 +57,7 @@ Your kubernetes service account will inherit the reader permissions. You will set the `lantency_profile_kubernetes_service_account` in your `terraform.tfvars` to the kubernetes service account name. -### Step 5: create artifact repository for automated Latency Profile Generator docker build +### Step 3: create artifact repository for automated Latency Profile Generator docker build The latency profile generator rebuilds the docker file on each terraform apply. The containers will be pushed to the given `artifact_registry`. This artifact @@ -71,7 +71,7 @@ gcloud artifacts repositories create ai-benchmark --location=us-central1 --repos ``` -### Step 6: create and configure terraform.tfvars +### Step 4: create and configure terraform.tfvars Create a `terraform.tfvars` file. `./sample-tfvars` is provided as an example file. You can copy the file as a starting point. 
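A minimal `terraform.tfvars` for the step above might look like the sketch below. This is only an illustration assembled from the sample values and variable defaults that appear earlier in this series; the project ID, output bucket, and registry path are placeholders rather than tested settings.

```hcl
# Hypothetical terraform.tfvars sketch; every value below is a placeholder to
# be replaced with your own project, cluster, bucket, and model server details.
credentials_config = {
  kubeconfig = {
    path = "~/.kube/config"
  }
}

project_id = "my-project-id"

# Latency profile generator service configuration
artifact_registry                          = "us-central1-docker.pkg.dev/my-project-id/ai-benchmark"
inference_server_service                   = "tgi" # inference server service name
latency_profile_kubernetes_service_account = "sample-runner-ksa"
output_bucket                              = "my-benchmark-output-bucket"
ksa                                        = "benchmark-ksa"
k8s_hf_secret                              = "hf-token"

# Benchmark configuration for the latency profile generator
inference_server_framework = "tgi"
tokenizer                  = "tiiuae/falcon-7b"
max_num_prompts            = 1000
max_prompt_len             = 256
max_output_len             = 256
```

Variables omitted here fall back to the defaults declared in `variables.tf`; `credentials_config`, `project_id`, `inference_server_service`, and `output_bucket` have no defaults and must always be set.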
From 312fba689d01cf0016c8d03df98360c2d30ab89c Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 13 Aug 2024 22:16:28 +0000 Subject: [PATCH 06/37] correct steps --- benchmarks/benchmark/tools/latency-profile/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/README.md b/benchmarks/benchmark/tools/latency-profile/README.md index 21a97234e..a329894cd 100644 --- a/benchmarks/benchmark/tools/latency-profile/README.md +++ b/benchmarks/benchmark/tools/latency-profile/README.md @@ -146,7 +146,7 @@ hugging_face_secret = $SECRET_ID hugging_face_secret_version = $SECRET_VERSION ``` -### Step 7: login to gcloud +### Step 5: login to gcloud Run the following gcloud command for authorization: @@ -154,7 +154,7 @@ Run the following gcloud command for authorization: gcloud auth application-default login ``` -### Step 8: terraform initialize, plan and apply +### Step 6: terraform initialize, plan and apply Run the following terraform commands: From 89730048c90cb565e4d0da918d54462632e99c6a Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 13 Aug 2024 22:50:02 +0000 Subject: [PATCH 07/37] jetstream option for backend arg --- .../tools/latency-profile/container/benchmark_serving.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py b/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py index 4fd25db30..842b0af39 100644 --- a/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py @@ -376,6 +376,7 @@ def main(args: argparse.Namespace): "naive_transformers", "tensorrt_llm_triton", "sax", + "jetstream" ], ) parser.add_argument( From 3e622399fdf3f3317793cf8bb4190d0ae4e5ced7 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 14 Aug 2024 21:45:56 +0000 Subject: [PATCH 08/37] extra parameters --- .../tools/latency-profile/container/Dockerfile | 1 - .../container/latency_throughput_curve.sh | 6 +----- benchmarks/benchmark/tools/latency-profile/main.tf | 2 ++ .../latency-profile-generator.yaml.tpl | 10 +++++++++- .../manifest-templates/service-account.yaml.tpl | 4 +++- .../benchmark/tools/latency-profile/variables.tf | 13 +++++++++++++ 6 files changed, 28 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/container/Dockerfile b/benchmarks/benchmark/tools/latency-profile/container/Dockerfile index 4c656a0b7..a133294a8 100644 --- a/benchmarks/benchmark/tools/latency-profile/container/Dockerfile +++ b/benchmarks/benchmark/tools/latency-profile/container/Dockerfile @@ -11,7 +11,6 @@ COPY requirements.txt requirements.txt RUN pip install -r requirements.txt RUN pip install -U "huggingface_hub[cli]" -RUN pip install gsutil RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json diff --git a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh index 6cae3199e..707f8271c 100755 --- a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh +++ b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh @@ -22,10 +22,6 @@ huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential timestamp=$(date +"%Y-%m-%d_%H-%M-%S") output_file="latency-profile-${timestamp}.txt" for ((i = 1 
; i <= 2 ; i*=2 )); do - python3 benchmark_serving.py --host="$IP" --port=80 --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$i --backend="$BACKEND" --num-prompts=2 --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file + python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$i --backend="$BACKEND" --num-prompts=2 --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file done -TOKEN=$(curl -s -H "Metadata-Flavor: Google" http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/token | jq -r .access_token) -gsutil config -e -o Credentials:gs_oauth2_refresh_token=$TOKEN - -gsutil cp $output_file "gs://$OUTPUT_BUCKET/$output_file" diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 5016fdb0e..42cf3146f 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -30,9 +30,11 @@ locals { artifact_registry = var.artifact_registry namespace = var.namespace inference_server_service = var.inference_server_service + inference_server_service_port = var.inference_server_service_port inference_server_framework = var.inference_server_framework ksa = var.ksa latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account + google_service_account = var.google_service_account max_num_prompts = var.max_num_prompts max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 52b01a631..53f189854 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -20,12 +20,17 @@ spec: containers: - name: lantency-profile-generator image: ${artifact_registry}/latency-profile:latest + resources: + limits: + nvidia.com/gpu: 1 command: ["bash", "-c", "./latency_throughput_curve.sh"] env: - name: TOKENIZER value: ${tokenizer} - name: IP value: ${inference_server_service} + - name: PORT + value: ${inference_server_service_port} - name: BACKEND value: ${inference_server_framework} - name: INPUT_LENGTH @@ -38,4 +43,7 @@ spec: valueFrom: secretKeyRef: name: hf-token - key: HF_TOKEN \ No newline at end of file + key: HF_TOKEN + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 # nvidia-h100-80gb, nvidia-l4 + iam.gke.io/gke-metadata-server-enabled: "true" \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl index da0da4267..02eff324f 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl @@ -2,4 +2,6 @@ apiVersion: v1 kind: ServiceAccount metadata: name: ${latency_profile_kubernetes_service_account} - namespace: ${namespace} \ No newline at end of file + namespace: ${namespace} + annotations: + iam.gke.io/gcp-service-account: 
"${google_service_account}@tpu-vm-gke-testing.iam.gserviceaccount.com" \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 62915f15b..586150440 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -71,6 +71,12 @@ variable "inference_server_service" { nullable = false } +variable "inference_server_service_port" { + description = "Inference server service port" + type = number + nullable = false +} + variable "inference_server_framework" { description = "Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt_llm_triton, sax" type = string @@ -130,6 +136,13 @@ variable "latency_profile_kubernetes_service_account" { default = "sample-runner-ksa" } +variable "google_service_account" { + description = "Google Service Account bound to the kubernetes service account" + type = string + default = "" + nullable = false +} + // TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644 variable "k8s_hf_secret" { description = "Name of secret for huggingface token; stored in k8s " From 44143e7c25fbf3faaf5b20a79180ed91c0fdd2f8 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 15 Aug 2024 20:31:29 +0000 Subject: [PATCH 09/37] configurable pipeline starting point, request rates configurable --- .../container/latency_throughput_curve.sh | 8 +-- .../benchmark/tools/latency-profile/main.tf | 5 ++ .../latency-profile-generator.yaml.tpl | 2 + .../tools/latency-profile/variables.tf | 58 +++++++++++++++++++ .../benchmark/tools/profile-generator/main.tf | 0 .../tools/profile-generator/variables.tf | 0 6 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 benchmarks/benchmark/tools/profile-generator/main.tf create mode 100644 benchmarks/benchmark/tools/profile-generator/variables.tf diff --git a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh index 707f8271c..e7ae88b8a 100755 --- a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh +++ b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh @@ -19,9 +19,9 @@ export IP=$IP huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") -output_file="latency-profile-${timestamp}.txt" -for ((i = 1 ; i <= 2 ; i*=2 )); do - python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$i --backend="$BACKEND" --num-prompts=2 --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file +for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do + timestamp=$(date +"%Y-%m-%d_%H-%M-%S") + output_file="latency-profile-${timestamp}.txt" + python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file done diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf 
b/benchmarks/benchmark/tools/latency-profile/main.tf index 42cf3146f..3906c6417 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -14,6 +14,10 @@ * limitations under the License. */ +## BEFORE APPLYING TEMPLATES + +# 1) Assure that we need to upload the new data point if either there is none of the existing one is unsatisfactory +# 2) Use the `catalog generate` tool to generate the manifests and pipe them to `kubectl apply -f`, assure kubectl succeeds locals { templates = [ for f in fileset(local.templates_path, "*tpl") : @@ -38,6 +42,7 @@ locals { max_num_prompts = var.max_num_prompts max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len + request_rates = join(",", [for number in var.request_rates : tostring(number)]) tokenizer = var.tokenizer hugging_face_token_b64 = var.hugging_face_token_b64 k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 53f189854..c3fb03b83 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -37,6 +37,8 @@ spec: value: ${max_prompt_len} - name: OUTPUT_LENGTH value: ${max_output_len} + - name: REQUEST_RATES + value: ${request_rates} - name: OUTPUT_BUCKET value: ${output_bucket} - name: HF_TOKEN diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 586150440..896934cf0 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -118,6 +118,13 @@ variable "max_prompt_len" { } } +variable "request_rates" { + description = "" + type = list(number) + default = [1, 2] + nullable = false +} + variable "tokenizer" { description = "Benchmark server configuration for tokenizer." type = string @@ -155,4 +162,55 @@ variable "hugging_face_token_b64" { description = "Base 64 encoded hugging face token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" type = string nullable = false +} + +variable "pipeline_config" { + description = "All combinations of model/model_server/accelerators to benchmark" + type = object({ + valid_models = list(string) + valid_accelerators = list(string) + request_rates = list(number) + + config = list(object({ + model_server = string # Model server name + model_server_configs = list(object({ + models = list(string) # model name + model_configs = list(object({ + accelerators = list(string) # Accelerator name + accelerator_configs = list(object({ + accelerator_count = number # Number of accelerators + })) + })) + })) + })) + }) + + validation { + condition = alltrue([ + for cfg in var.pipeline_config.config : alltrue([ + for model_server_config in cfg.model_server_configs : ( + alltrue([ + for model_config in model_server_config.model_configs : + alltrue([for accelerator in model_config.accelerators : + contains(var.pipeline_config.valid_accelerators, accelerator)]) + ]) + ) + ]) + ]) + error_message = "Each accelerator must be in the valid_accelerators list." 
+ } + + validation { + condition = alltrue([ + for cfg in var.pipeline_config.config : alltrue([ + for model_server_config in cfg.model_server_configs : ( + alltrue([ + for model in model_server_config.models : + contains(var.pipeline_config.valid_models, model) + ]) + ) + ]) + ]) + error_message = "Each model must be in the valid_models list." + } } \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf new file mode 100644 index 000000000..e69de29bb From 91fe58157daf7b8b38e597a8c2a23b361f59fb70 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 16 Aug 2024 22:00:18 +0000 Subject: [PATCH 10/37] WIP changes/reversions --- .../benchmark/tools/latency-profile/main.tf | 8 +- .../manifest-templates/hf-secret.yaml.tpl | 7 - .../latency-profile-generator.yaml.tpl | 9 + .../manifest-templates/namespace.yaml.tpl | 4 - .../service-account.yaml.tpl | 7 - .../tools/latency-profile/variables.tf | 70 +----- .../benchmark/tools/profile-generator/main.tf | 39 ++++ .../tools/profile-generator/sample.tfvars | 105 +++++++++ .../tools/profile-generator/variables.tf | 218 ++++++++++++++++++ 9 files changed, 387 insertions(+), 80 deletions(-) delete mode 100644 benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl delete mode 100644 benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl delete mode 100644 benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl create mode 100644 benchmarks/benchmark/tools/profile-generator/sample.tfvars diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 3906c6417..be95ee59e 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -28,6 +28,11 @@ locals { ? "${path.module}/manifest-templates" : pathexpand(var.templates_path) ) + hugging_face_token_secret = ( + var.hugging_face_secret == null || var.hugging_face_secret_version == null + ? null + : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}" + ) all_manifests = flatten([for manifest_file in local.templates : [for data in split("---", templatefile(manifest_file, { @@ -38,13 +43,12 @@ locals { inference_server_framework = var.inference_server_framework ksa = var.ksa latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account - google_service_account = var.google_service_account max_num_prompts = var.max_num_prompts max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len request_rates = join(",", [for number in var.request_rates : tostring(number)]) tokenizer = var.tokenizer - hugging_face_token_b64 = var.hugging_face_token_b64 + hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] k8s_hf_secret_list = var.k8s_hf_secret == null ? 
[] : [var.k8s_hf_secret] output_bucket = var.output_bucket })) : data] diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl deleted file mode 100644 index d61629ee4..000000000 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: hf-token - namespace: ${namespace} -data: - HF_TOKEN: ${hugging_face_token_b64} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index c3fb03b83..72e5773a2 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -41,11 +41,20 @@ spec: value: ${request_rates} - name: OUTPUT_BUCKET value: ${output_bucket} +%{ for hugging_face_token_secret in hugging_face_token_secret_list ~} - name: HF_TOKEN valueFrom: secretKeyRef: name: hf-token key: HF_TOKEN +%{ endfor ~} +%{ for hf_token in k8s_hf_secret_list ~} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: HF_TOKEN +%{ endfor ~} nodeSelector: cloud.google.com/gke-accelerator: nvidia-l4 # nvidia-h100-80gb, nvidia-l4 iam.gke.io/gke-metadata-server-enabled: "true" \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl deleted file mode 100644 index 95e400737..000000000 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: ${namespace} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl deleted file mode 100644 index 02eff324f..000000000 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: ${latency_profile_kubernetes_service_account} - namespace: ${namespace} - annotations: - iam.gke.io/gcp-service-account: "${google_service_account}@tpu-vm-gke-testing.iam.gserviceaccount.com" \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 896934cf0..935a5bab8 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -143,13 +143,6 @@ variable "latency_profile_kubernetes_service_account" { default = "sample-runner-ksa" } -variable "google_service_account" { - description = "Google Service Account bound to the kubernetes service account" - type = string - default = "" - nullable = false -} - // TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644 variable "k8s_hf_secret" { description = "Name of secret for huggingface token; stored in k8s " 
@@ -158,59 +151,16 @@ variable "k8s_hf_secret" { default = null } -variable "hugging_face_token_b64" { - description = "Base 64 encoded hugging face token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" +variable "hugging_face_secret" { + description = "name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" type = string - nullable = false + nullable = true + default = null } -variable "pipeline_config" { - description = "All combinations of model/model_server/accelerators to benchmark" - type = object({ - valid_models = list(string) - valid_accelerators = list(string) - request_rates = list(number) - - config = list(object({ - model_server = string # Model server name - model_server_configs = list(object({ - models = list(string) # model name - model_configs = list(object({ - accelerators = list(string) # Accelerator name - accelerator_configs = list(object({ - accelerator_count = number # Number of accelerators - })) - })) - })) - })) - }) - - validation { - condition = alltrue([ - for cfg in var.pipeline_config.config : alltrue([ - for model_server_config in cfg.model_server_configs : ( - alltrue([ - for model_config in model_server_config.model_configs : - alltrue([for accelerator in model_config.accelerators : - contains(var.pipeline_config.valid_accelerators, accelerator)]) - ]) - ) - ]) - ]) - error_message = "Each accelerator must be in the valid_accelerators list." - } - - validation { - condition = alltrue([ - for cfg in var.pipeline_config.config : alltrue([ - for model_server_config in cfg.model_server_configs : ( - alltrue([ - for model in model_server_config.models : - contains(var.pipeline_config.valid_models, model) - ]) - ) - ]) - ]) - error_message = "Each model must be in the valid_models list." - } -} \ No newline at end of file +variable "hugging_face_secret_version" { + description = "Secret version in Secret Manager" + type = string + nullable = true + default = null +} diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index e69de29bb..e8ea340a2 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +module "latency-profile" { + source = "../latency-profile" + + credentials_config = var.credentials_config + namespace = var.namespace + project_id = var.project_id + ksa = var.ksa + templates_path = var.templates_path + artifact_registry = var.artifact_registry + inference_server_service = var.inference_server_service + inference_server_service_port = var.inference_server_service_port + inference_server_framework = var.inference_server_framework + max_num_prompts = var.max_num_prompts + max_output_len = var.max_output_len + max_prompt_len = var.max_prompt_len + request_rates = var.request_rates + tokenizer = var.tokenizer + output_bucket = var.output_bucket + latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account + k8s_hf_secret = var.k8s_hf_secret + hugging_face_secret = var.hugging_face_secret + hugging_face_secret_version = var.hugging_face_secret_version +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars new file mode 100644 index 000000000..907b76eef --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -0,0 +1,105 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +credentials_config = { + kubeconfig = { + path = "~/.kube/config" + } +} + +project_id = "tpu-vm-gke-testing" + + +# Latency profile generator service configuration +artifact_registry = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark" +inference_server_service = "maxengine-server" # inference server service name +inference_server_service_port = 8000 +latency_profile_kubernetes_service_account = "prom-frontend-sa" +output_bucket = "tpu-vm-gke-testing-benchmark-output-bucket" +k8s_hf_secret = "hf-token" + +# Benchmark configuration for Locust Docker accessing inference server +inference_server_framework = "jetstream" +tokenizer = "google/gemma-7b" +request_rates = [5, 10, 15, 20] + +profiles = { + valid_models = [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b", + "llama3-8b", + "llama3-70b", + "llama3-405b" + ] + valid_accelerators = [ + "tpu-v4-podslice", + "tpu-v5-lite-podslice", + "tpu-v5p-slice", + "nvidia-a100-80gb", + "nvidia-h100-80gb", + "nvidia-l4" + ] + request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] + + config = [{ + model_server = "Jetstream" + model_server_configs = [{ + models = [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b" + ] + model_configs = [] + }] + }, { + model_server = "vllm" + model_server_configs = [{ + models = [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b", + "llama3-8b", + "llama3-70b", + "llama3-405b" + ] + model_configs = [] + }] + }, { + model_server = "tgi" + model_server_configs = [{ + models = [ + "gemma2-2b", + "gemma2-9b", + "gemma2-27b", + "llama3-8b", + "llama3-70b", + "llama3-405b" + ] + model_configs = [] + }] + }, { + model_server = "tensorrt-llm" + model_server_configs = [{ + models = [ + "llama3-8b", + "llama3-70b", + "llama3-405b" + ] + model_configs = [] + }] + }] +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index e69de29bb..dbbfddc1a 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -0,0 +1,218 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "credentials_config" { + description = "Configure how Terraform authenticates to the cluster." + type = object({ + fleet_host = optional(string) + kubeconfig = optional(object({ + context = optional(string) + path = optional(string, "~/.kube/config") + })) + }) + nullable = false + validation { + condition = ( + (var.credentials_config.fleet_host != null) != + (var.credentials_config.kubeconfig != null) + ) + error_message = "Exactly one of fleet host or kubeconfig must be set." + } +} + +variable "namespace" { + description = "Namespace used for model and benchmarking deployments." + type = string + nullable = false + default = "default" +} + +variable "project_id" { + description = "Project id of existing or created project." 
+ type = string + nullable = false +} + + +variable "ksa" { + description = "Kubernetes Service Account used for workload." + type = string + nullable = false + default = "default" +} + +variable "templates_path" { + description = "Path where manifest templates will be read from. Set to null to use the default manifests" + type = string + default = null +} + +variable "artifact_registry" { + description = "Artifact registry for storing Latency Profile Generator container." + type = string + default = null +} + +variable "inference_server_service" { + description = "Inference server service" + type = string + nullable = false +} + +variable "inference_server_service_port" { + description = "Inference server service port" + type = number + nullable = false +} + +variable "inference_server_framework" { + description = "Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt_llm_triton, sax" + type = string + nullable = false + default = "tgi" + validation { + condition = var.inference_server_framework == "vllm" || var.inference_server_framework == "tgi" || var.inference_server_framework == "tensorrt_llm_triton" || var.inference_server_framework == "sax" || var.inference_server_framework == "jetstream" + error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream." + } +} + +variable "max_num_prompts" { + description = "Benchmark server configuration for max number of prompts." + type = number + default = 1000 + validation { + condition = var.max_num_prompts > 0 + error_message = "The max_num_prompts value must be greater than 0." + } +} + +variable "max_output_len" { + description = "Benchmark server configuration for max output length." + type = number + default = 256 + validation { + condition = var.max_output_len > 4 + error_message = "The max_output_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "max_prompt_len" { + description = "Benchmark server configuration for max prompt length." + type = number + default = 256 + validation { + condition = var.max_prompt_len > 4 + error_message = "The max_prompt_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "request_rates" { + description = "" + type = list(number) + default = [1, 2] + nullable = false +} + +variable "tokenizer" { + description = "Benchmark server configuration for tokenizer." + type = string + nullable = false + default = "tiiuae/falcon-7b" +} + +variable "output_bucket" { + description = "Bucket name for storing results" + type = string +} + +variable "latency_profile_kubernetes_service_account" { + description = "Kubernetes Service Account to be used for the latency profile generator tool" + type = string + default = "sample-runner-ksa" +} + +// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644 +variable "k8s_hf_secret" { + description = "Name of secret for huggingface token; stored in k8s " + type = string + nullable = true + default = null +} + +variable "hugging_face_secret" { + description = "name of the kubectl huggingface secret token; stored in Secret Manager. 
Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" + type = string + nullable = true + default = null +} + +variable "hugging_face_secret_version" { + description = "Secret version in Secret Manager" + type = string + nullable = true + default = null +} + +variable "profiles" { + description = "Model servers to benchmark" + type = object({ + valid_models = list(string) + valid_accelerators = list(string) + request_rates = list(number) + + config = list(object({ + model_server = string # Model server name + model_server_configs = list(object({ + models = list(string) # model name + model_configs = list(object({ + accelerators = list(string) # Accelerator name + accelerator_configs = list(object({ + accelerator_count = number # Number of accelerators + })) + })) + })) + })) + }) + + validation { + condition = alltrue([ + for cfg in var.profiles.config : alltrue([ + for model_server_config in cfg.model_server_configs : ( + alltrue([ + for model_config in model_server_config.model_configs : + alltrue([for accelerator in model_config.accelerators : + contains(var.profiles.valid_accelerators, accelerator)]) + ]) + ) + ]) + ]) + error_message = "Each accelerator must be in the valid_accelerators list." + } + + validation { + condition = alltrue([ + for cfg in var.profiles.config : alltrue([ + for model_server_config in cfg.model_server_configs : ( + alltrue([ + for model in model_server_config.models : + contains(var.profiles.valid_models, model) + ]) + ) + ]) + ]) + error_message = "Each model must be in the valid_models list." + } +} \ No newline at end of file From 545531d9dd29158e061e38c907481f5dc2fa4810 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 16 Aug 2024 22:24:48 +0000 Subject: [PATCH 11/37] setting for building latency profiler image --- benchmarks/benchmark/tools/latency-profile/build.tf | 2 +- benchmarks/benchmark/tools/latency-profile/main.tf | 6 +----- benchmarks/benchmark/tools/latency-profile/variables.tf | 6 ++++++ benchmarks/benchmark/tools/profile-generator/main.tf | 7 +++++++ benchmarks/benchmark/tools/profile-generator/sample.tfvars | 1 + benchmarks/benchmark/tools/profile-generator/variables.tf | 6 ++++++ 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/build.tf b/benchmarks/benchmark/tools/latency-profile/build.tf index 784cf09f7..89deb3744 100644 --- a/benchmarks/benchmark/tools/latency-profile/build.tf +++ b/benchmarks/benchmark/tools/latency-profile/build.tf @@ -1,5 +1,5 @@ resource "null_resource" "build_and_push_image" { - + count = var.build_latency_profile_generator_image ? 1 : 0 depends_on = [resource.google_project_service.cloudbuild] provisioner "local-exec" { working_dir = path.module diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index be95ee59e..c2d03e7ee 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -13,11 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -## BEFORE APPLYING TEMPLATES - -# 1) Assure that we need to upload the new data point if either there is none of the existing one is unsatisfactory -# 2) Use the `catalog generate` tool to generate the manifests and pipe them to `kubectl apply -f`, assure kubectl succeeds locals { templates = [ for f in fileset(local.templates_path, "*tpl") : @@ -56,6 +51,7 @@ locals { } resource "google_project_service" "cloudbuild" { + count = var.build_latency_profile_generator_image ? 1 : 0 project = var.project_id service = "cloudbuild.googleapis.com" diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 935a5bab8..9a5a58e73 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -65,6 +65,12 @@ variable "artifact_registry" { default = null } +variable "build_latency_profile_generator_image" { + description = "Whether latency profile generator image will be built or not" + type = bool + default = true +} + variable "inference_server_service" { description = "Inference server service" type = string diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index e8ea340a2..7d4c18542 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -14,6 +14,12 @@ * limitations under the License. */ +## BEFORE APPLYING TEMPLATES + +# 1) Assure that we need to upload the new data point if either there is none of the existing one is unsatisfactory +# 2) Use the `catalog generate` tool to generate the manifests and pipe them to `kubectl apply -f`, assure kubectl succeeds + + module "latency-profile" { source = "../latency-profile" @@ -23,6 +29,7 @@ module "latency-profile" { ksa = var.ksa templates_path = var.templates_path artifact_registry = var.artifact_registry + build_latency_profile_generator_image = var.build_latency_profile_generator_image inference_server_service = var.inference_server_service inference_server_service_port = var.inference_server_service_port inference_server_framework = var.inference_server_framework diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index 907b76eef..99e8466cf 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -25,6 +25,7 @@ project_id = "tpu-vm-gke-testing" # Latency profile generator service configuration artifact_registry = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark" +build_latency_profile_generator_image = false inference_server_service = "maxengine-server" # inference server service name inference_server_service_port = 8000 latency_profile_kubernetes_service_account = "prom-frontend-sa" diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index dbbfddc1a..202af501f 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -66,6 +66,12 @@ variable "artifact_registry" { default = null } +variable "build_latency_profile_generator_image" { + description = "Whether latency profile generator image will be built or not" + type = bool + default = true +} + variable "inference_server_service" { description = "Inference server service" type = 
string From 8f2ea1ca5d95b68b90ba4242044aadfd5ce593fd Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 16 Aug 2024 22:25:59 +0000 Subject: [PATCH 12/37] onoly build once for profile-generator --- .../benchmark/tools/profile-generator/build.tf | 8 ++++++++ .../benchmark/tools/profile-generator/main.tf | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 benchmarks/benchmark/tools/profile-generator/build.tf diff --git a/benchmarks/benchmark/tools/profile-generator/build.tf b/benchmarks/benchmark/tools/profile-generator/build.tf new file mode 100644 index 000000000..89deb3744 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/build.tf @@ -0,0 +1,8 @@ +resource "null_resource" "build_and_push_image" { + count = var.build_latency_profile_generator_image ? 1 : 0 + depends_on = [resource.google_project_service.cloudbuild] + provisioner "local-exec" { + working_dir = path.module + command = "gcloud builds submit --tag ${var.artifact_registry}/latency-profile:latest container" + } +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index 7d4c18542..ccb1fbef6 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -20,6 +20,19 @@ # 2) Use the `catalog generate` tool to generate the manifests and pipe them to `kubectl apply -f`, assure kubectl succeeds +resource "google_project_service" "cloudbuild" { + count = var.build_latency_profile_generator_image ? 1 : 0 + project = var.project_id + service = "cloudbuild.googleapis.com" + + timeouts { + create = "30m" + update = "40m" + } + + disable_on_destroy = false +} + module "latency-profile" { source = "../latency-profile" @@ -29,7 +42,7 @@ module "latency-profile" { ksa = var.ksa templates_path = var.templates_path artifact_registry = var.artifact_registry - build_latency_profile_generator_image = var.build_latency_profile_generator_image + build_latency_profile_generator_image = false inference_server_service = var.inference_server_service inference_server_service_port = var.inference_server_service_port inference_server_framework = var.inference_server_framework From 29556cbf7f538ea9e7e30c811e9dfe838672d1e0 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 16 Aug 2024 22:26:08 +0000 Subject: [PATCH 13/37] fmt --- benchmarks/benchmark/tools/profile-generator/build.tf | 2 +- benchmarks/benchmark/tools/profile-generator/main.tf | 2 +- benchmarks/benchmark/tools/profile-generator/variables.tf | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/build.tf b/benchmarks/benchmark/tools/profile-generator/build.tf index 89deb3744..2f5e17cd3 100644 --- a/benchmarks/benchmark/tools/profile-generator/build.tf +++ b/benchmarks/benchmark/tools/profile-generator/build.tf @@ -1,5 +1,5 @@ resource "null_resource" "build_and_push_image" { - count = var.build_latency_profile_generator_image ? 1 : 0 + count = var.build_latency_profile_generator_image ? 
1 : 0 depends_on = [resource.google_project_service.cloudbuild] provisioner "local-exec" { working_dir = path.module diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index ccb1fbef6..037a42268 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -21,7 +21,7 @@ resource "google_project_service" "cloudbuild" { - count = var.build_latency_profile_generator_image ? 1 : 0 + count = var.build_latency_profile_generator_image ? 1 : 0 project = var.project_id service = "cloudbuild.googleapis.com" diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index 202af501f..ff427a714 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -68,8 +68,8 @@ variable "artifact_registry" { variable "build_latency_profile_generator_image" { description = "Whether latency profile generator image will be built or not" - type = bool - default = true + type = bool + default = true } variable "inference_server_service" { From 6a77639da82ef34a72d4f1424e18198240c51650 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 16 Aug 2024 22:32:23 +0000 Subject: [PATCH 14/37] fmt --- benchmarks/benchmark/tools/latency-profile/build.tf | 2 +- benchmarks/benchmark/tools/latency-profile/main.tf | 2 +- benchmarks/benchmark/tools/latency-profile/variables.tf | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/build.tf b/benchmarks/benchmark/tools/latency-profile/build.tf index 89deb3744..2f5e17cd3 100644 --- a/benchmarks/benchmark/tools/latency-profile/build.tf +++ b/benchmarks/benchmark/tools/latency-profile/build.tf @@ -1,5 +1,5 @@ resource "null_resource" "build_and_push_image" { - count = var.build_latency_profile_generator_image ? 1 : 0 + count = var.build_latency_profile_generator_image ? 1 : 0 depends_on = [resource.google_project_service.cloudbuild] provisioner "local-exec" { working_dir = path.module diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index c2d03e7ee..f9b9cfcb1 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -51,7 +51,7 @@ locals { } resource "google_project_service" "cloudbuild" { - count = var.build_latency_profile_generator_image ? 1 : 0 + count = var.build_latency_profile_generator_image ? 
1 : 0 project = var.project_id service = "cloudbuild.googleapis.com" diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 9a5a58e73..b63e10f34 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -67,8 +67,8 @@ variable "artifact_registry" { variable "build_latency_profile_generator_image" { description = "Whether latency profile generator image will be built or not" - type = bool - default = true + type = bool + default = true } variable "inference_server_service" { From 6b15b61a356d492c9caa00e9474f49b95c622c27 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 19 Aug 2024 22:21:53 +0000 Subject: [PATCH 15/37] Move deploy model server from profile-generator to latency-profile --- .../container/latency_throughput_curve.sh | 1 + .../benchmark/tools/latency-profile/main.tf | 32 +++++++++++++--- .../latency-profile-generator.yaml.tpl | 15 +++----- .../tools/latency-profile/sample.tfvars | 31 ++++++++++++++-- .../tools/latency-profile/variables.tf | 30 +++++++-------- .../tools/locust-load-inference/main.tf | 2 +- .../benchmark/tools/profile-generator/main.tf | 13 ++----- .../tools/profile-generator/sample.tfvars | 15 ++++++-- .../tools/profile-generator/variables.tf | 37 ++++++------------- 9 files changed, 102 insertions(+), 74 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh index e7ae88b8a..ec5a838f0 100755 --- a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh +++ b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh @@ -20,6 +20,7 @@ export IP=$IP huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do + # TODO: Check if profile already exists, if so then skip timestamp=$(date +"%Y-%m-%d_%H-%M-%S") output_file="latency-profile-${timestamp}.txt" python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index f9b9cfcb1..7959281a1 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -33,16 +33,16 @@ locals { [for data in split("---", templatefile(manifest_file, { artifact_registry = var.artifact_registry namespace = var.namespace - inference_server_service = var.inference_server_service - inference_server_service_port = var.inference_server_service_port - inference_server_framework = var.inference_server_framework + inference_server_framework = var.inference_server.name + inference_server_service = var.inference_server.service.name + inference_server_service_port = var.inference_server.service.port + tokenizer = var.inference_server.tokenizer ksa = var.ksa latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account max_num_prompts = var.max_num_prompts max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len request_rates = join(",", [for number in var.request_rates : tostring(number)]) - 
tokenizer = var.tokenizer hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] output_bucket = var.output_bucket @@ -63,11 +63,31 @@ resource "google_project_service" "cloudbuild" { disable_on_destroy = false } -resource "kubernetes_manifest" "default" { +resource "null_resource" "deploy_model_server" { + count = var.inference_server.deploy ? 1 : 0 + provisioner "local-exec" { + command = "echo hello" + } + triggers = { + always_run = "${timestamp()}" + } +} + +resource "kubernetes_manifest" "deploy_latency_profile_generator" { for_each = toset(local.all_manifests) - depends_on = [resource.null_resource.build_and_push_image] + depends_on = [resource.null_resource.build_and_push_image, resource.null_resource.deploy_model_server] manifest = yamldecode(each.value) timeouts { create = "30m" } } + +resource "null_resource" "cleanup_model_server" { + depends_on = [ resource.kubernetes_manifest.deploy_latency_profile_generator ] + provisioner "local-exec" { + command = "kubectl wait --for=condition=complete job/lantency-profile-generator-test && echo hello" + } + triggers = { + always_run = "${timestamp()}" + } +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 72e5773a2..101b23127 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -1,15 +1,11 @@ -apiVersion: "apps/v1" -kind: "Deployment" +apiVersion: batch/v1 +kind: Job metadata: - name: lantency-profile-generator + name: latency-profile-generator namespace: ${namespace} labels: - name: lantency-profile-generator + name: latency-profile-generator spec: - replicas: 1 - selector: - matchLabels: - app: lantency-profile-generator template: metadata: labels: @@ -17,8 +13,9 @@ spec: examples.ai.gke.io/source: ai-on-gke-benchmarks spec: serviceAccountName: ${latency_profile_kubernetes_service_account} + restartPolicy: Never containers: - - name: lantency-profile-generator + - name: latency-profile-generator image: ${artifact_registry}/latency-profile:latest resources: limits: diff --git a/benchmarks/benchmark/tools/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/latency-profile/sample.tfvars index d0cf45dfb..3a24d0aa7 100644 --- a/benchmarks/benchmark/tools/latency-profile/sample.tfvars +++ b/benchmarks/benchmark/tools/latency-profile/sample.tfvars @@ -1,3 +1,19 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + credentials_config = { fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUM/locations/global/gkeMemberships/ai-benchmark" } @@ -11,11 +27,20 @@ k8s_hf_secret = "hf-token" # Latency profile generator service configuration artifact_registry = "us-central1-docker.pkg.dev/$PROJECT_ID/ai-benchmark" -inference_server_service = "tgi" # inference server service name latency_profile_kubernetes_service_account = "sample-runner-ksa" output_bucket = "${PROJECT_ID}-benchmark-output" gcs_path = "gs://${PROJECT_ID}-ai-gke-benchmark-fuse/ShareGPT_V3_unfiltered_cleaned_split_filtered_prompts.txt" +# Inference server configuration +inference_server = { + deploy = false + name = "tgi" + tokenizer = "tiiuae/falcon-7b" + service = { + name = "tgi", # inference server service name + port = 8000 + } +} + # Benchmark configuration for Locust Docker accessing inference server -inference_server_framework = "tgi" -tokenizer = "tiiuae/falcon-7b" \ No newline at end of file +request_rates = [5, 10, 15, 20] \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index b63e10f34..4679e884e 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -71,25 +71,21 @@ variable "build_latency_profile_generator_image" { default = true } -variable "inference_server_service" { - description = "Inference server service" - type = string - nullable = false -} - -variable "inference_server_service_port" { - description = "Inference server service port" - type = number - nullable = false -} +# Inference server configuration +variable "inference_server" { + type = object({ + deploy = optional(bool), # Do you want this module to deploy the model server? + name = string, + tokenizer = string, + service = object({ + name = string, + port = number, + }) + }) + nullable = false -variable "inference_server_framework" { - description = "Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt_llm_triton, sax" - type = string - nullable = false - default = "tgi" validation { - condition = var.inference_server_framework == "vllm" || var.inference_server_framework == "tgi" || var.inference_server_framework == "tensorrt_llm_triton" || var.inference_server_framework == "sax" || var.inference_server_framework == "jetstream" + condition = var.inference_server.name == "vllm" || var.inference_server.name == "tgi" || var.inference_server.name == "tensorrt_llm_triton" || var.inference_server.name == "sax" || var.inference_server.name == "jetstream" error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream." 
} } diff --git a/benchmarks/benchmark/tools/locust-load-inference/main.tf b/benchmarks/benchmark/tools/locust-load-inference/main.tf index ff557f9ee..6d5fcf65a 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/main.tf +++ b/benchmarks/benchmark/tools/locust-load-inference/main.tf @@ -34,7 +34,7 @@ locals { [for data in split("---", templatefile(manifest_file, { artifact_registry = var.artifact_registry namespace = var.namespace - inference_server_service = var.inference_server_service + inference_server_service = var.inference_server.service inference_server_framework = var.inference_server_framework best_of = var.best_of gcs_path = var.gcs_path diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index 037a42268..6e668af1d 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -14,10 +14,6 @@ * limitations under the License. */ -## BEFORE APPLYING TEMPLATES - -# 1) Assure that we need to upload the new data point if either there is none of the existing one is unsatisfactory -# 2) Use the `catalog generate` tool to generate the manifests and pipe them to `kubectl apply -f`, assure kubectl succeeds resource "google_project_service" "cloudbuild" { @@ -33,6 +29,8 @@ resource "google_project_service" "cloudbuild" { disable_on_destroy = false } +# CREATE NODEPOOLS + module "latency-profile" { source = "../latency-profile" @@ -42,15 +40,12 @@ module "latency-profile" { ksa = var.ksa templates_path = var.templates_path artifact_registry = var.artifact_registry - build_latency_profile_generator_image = false - inference_server_service = var.inference_server_service - inference_server_service_port = var.inference_server_service_port - inference_server_framework = var.inference_server_framework + build_latency_profile_generator_image = false # Dont build image for each profile generator instance, only need to do once. 
+ inference_server = var.inference_server max_num_prompts = var.max_num_prompts max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len request_rates = var.request_rates - tokenizer = var.tokenizer output_bucket = var.output_bucket latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account k8s_hf_secret = var.k8s_hf_secret diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index 99e8466cf..3fde985ac 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -26,15 +26,22 @@ project_id = "tpu-vm-gke-testing" # Latency profile generator service configuration artifact_registry = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark" build_latency_profile_generator_image = false -inference_server_service = "maxengine-server" # inference server service name -inference_server_service_port = 8000 latency_profile_kubernetes_service_account = "prom-frontend-sa" output_bucket = "tpu-vm-gke-testing-benchmark-output-bucket" k8s_hf_secret = "hf-token" +# Inference server configuration +inference_server = { + deploy = false + name = "jetstream" + tokenizer = "google/gemma-7b" + service = { + name = "maxengine-server", # inference server service name + port = 8000 + } +} + # Benchmark configuration for Locust Docker accessing inference server -inference_server_framework = "jetstream" -tokenizer = "google/gemma-7b" request_rates = [5, 10, 15, 20] profiles = { diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index ff427a714..9d4030254 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -46,7 +46,6 @@ variable "project_id" { nullable = false } - variable "ksa" { description = "Kubernetes Service Account used for workload." type = string @@ -72,25 +71,20 @@ variable "build_latency_profile_generator_image" { default = true } -variable "inference_server_service" { - description = "Inference server service" - type = string - nullable = false -} - -variable "inference_server_service_port" { - description = "Inference server service port" - type = number - nullable = false -} +variable "inference_server" { + type = object({ + deploy = optional(bool), # Do you want this module to deploy the model server? + name = string, + tokenizer = string, # Benchmark server configuration for tokenizer + service = object({ + name = string, + port = number, + }) + }) + nullable = false -variable "inference_server_framework" { - description = "Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt_llm_triton, sax" - type = string - nullable = false - default = "tgi" validation { - condition = var.inference_server_framework == "vllm" || var.inference_server_framework == "tgi" || var.inference_server_framework == "tensorrt_llm_triton" || var.inference_server_framework == "sax" || var.inference_server_framework == "jetstream" + condition = var.inference_server.name == "vllm" || var.inference_server.name == "tgi" || var.inference_server.name == "tensorrt_llm_triton" || var.inference_server.name == "sax" || var.inference_server.name == "jetstream" error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream." 
} } @@ -132,13 +126,6 @@ variable "request_rates" { nullable = false } -variable "tokenizer" { - description = "Benchmark server configuration for tokenizer." - type = string - nullable = false - default = "tiiuae/falcon-7b" -} - variable "output_bucket" { description = "Bucket name for storing results" type = string From a295aa9e1fa7e78e8dcd1731d823206ae3b1e75b Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 19 Aug 2024 23:04:09 +0000 Subject: [PATCH 16/37] fix kubectl wait --- benchmarks/benchmark/tools/latency-profile/main.tf | 2 +- .../manifest-templates/latency-profile-generator.yaml.tpl | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 7959281a1..88415b9b8 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -85,7 +85,7 @@ resource "kubernetes_manifest" "deploy_latency_profile_generator" { resource "null_resource" "cleanup_model_server" { depends_on = [ resource.kubernetes_manifest.deploy_latency_profile_generator ] provisioner "local-exec" { - command = "kubectl wait --for=condition=complete job/lantency-profile-generator-test && echo hello" + command = "kubectl wait --for=condition=complete job/latency-profile-generator --timeout=-9600s && echo hello" } triggers = { always_run = "${timestamp()}" diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 101b23127..fb6e1d9e1 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -7,10 +7,6 @@ metadata: name: latency-profile-generator spec: template: - metadata: - labels: - app: lantency-profile-generator - examples.ai.gke.io/source: ai-on-gke-benchmarks spec: serviceAccountName: ${latency_profile_kubernetes_service_account} restartPolicy: Never From f4e76b20d0020c17df479dfe9076760cce6fe630 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Mon, 19 Aug 2024 23:06:50 +0000 Subject: [PATCH 17/37] dmt --- benchmarks/benchmark/tools/latency-profile/main.tf | 2 +- benchmarks/benchmark/tools/latency-profile/sample.tfvars | 8 ++++---- benchmarks/benchmark/tools/latency-profile/variables.tf | 4 ++-- .../benchmark/tools/profile-generator/sample.tfvars | 6 +++--- benchmarks/benchmark/tools/profile-generator/variables.tf | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 88415b9b8..8f4714d82 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -83,7 +83,7 @@ resource "kubernetes_manifest" "deploy_latency_profile_generator" { } resource "null_resource" "cleanup_model_server" { - depends_on = [ resource.kubernetes_manifest.deploy_latency_profile_generator ] + depends_on = [resource.kubernetes_manifest.deploy_latency_profile_generator] provisioner "local-exec" { command = "kubectl wait --for=condition=complete job/latency-profile-generator --timeout=-9600s && echo hello" } diff --git a/benchmarks/benchmark/tools/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/latency-profile/sample.tfvars index 3a24d0aa7..e5d45f3e3 100644 
--- a/benchmarks/benchmark/tools/latency-profile/sample.tfvars +++ b/benchmarks/benchmark/tools/latency-profile/sample.tfvars @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - + credentials_config = { fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUM/locations/global/gkeMemberships/ai-benchmark" } @@ -33,8 +33,8 @@ gcs_path = "gs://${PROJECT_ID}-ai-gke-benchmar # Inference server configuration inference_server = { - deploy = false - name = "tgi" + deploy = false + name = "tgi" tokenizer = "tiiuae/falcon-7b" service = { name = "tgi", # inference server service name @@ -43,4 +43,4 @@ inference_server = { } # Benchmark configuration for Locust Docker accessing inference server -request_rates = [5, 10, 15, 20] \ No newline at end of file +request_rates = [5, 10, 15, 20] \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 4679e884e..5bbd2c1e8 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -74,8 +74,8 @@ variable "build_latency_profile_generator_image" { # Inference server configuration variable "inference_server" { type = object({ - deploy = optional(bool), # Do you want this module to deploy the model server? - name = string, + deploy = optional(bool), # Do you want this module to deploy the model server? + name = string, tokenizer = string, service = object({ name = string, diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index 3fde985ac..b28cf8807 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -32,8 +32,8 @@ k8s_hf_secret = "hf-token" # Inference server configuration inference_server = { - deploy = false - name = "jetstream" + deploy = false + name = "jetstream" tokenizer = "google/gemma-7b" service = { name = "maxengine-server", # inference server service name @@ -42,7 +42,7 @@ inference_server = { } # Benchmark configuration for Locust Docker accessing inference server -request_rates = [5, 10, 15, 20] +request_rates = [5, 10, 15, 20] profiles = { valid_models = [ diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index 9d4030254..7a4dcea7f 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -73,8 +73,8 @@ variable "build_latency_profile_generator_image" { variable "inference_server" { type = object({ - deploy = optional(bool), # Do you want this module to deploy the model server? - name = string, + deploy = optional(bool), # Do you want this module to deploy the model server? 
+ name = string, tokenizer = string, # Benchmark server configuration for tokenizer service = object({ name = string, From ccc512f437f4b732b34f14ae6701387f7a465ad5 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Tue, 20 Aug 2024 22:58:47 +0000 Subject: [PATCH 18/37] intermediate changes --- .../benchmark/tools/latency-profile/main.tf | 15 ++++++ .../latency-profile-generator.yaml.tpl | 2 +- .../tools/latency-profile/providers.tf | 36 ------------- .../tools/latency-profile/variables.tf | 5 ++ .../benchmark/tools/profile-generator/main.tf | 52 ++++++++++++++++++- .../tools/profile-generator/sample.tfvars | 23 ++++---- .../tools/profile-generator/variables.tf | 18 ------- 7 files changed, 81 insertions(+), 70 deletions(-) delete mode 100644 benchmarks/benchmark/tools/latency-profile/providers.tf diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 8f4714d82..7460e1229 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -31,6 +31,7 @@ locals { all_manifests = flatten([for manifest_file in local.templates : [for data in split("---", templatefile(manifest_file, { + combo = format("%s-%s-%s-%s", var.inference_server.name, var.inference_server.model, var.inference_server.accelerator_config.type, var.inference_server.accelerator_config.count) artifact_registry = var.artifact_registry namespace = var.namespace inference_server_framework = var.inference_server.name @@ -50,6 +51,20 @@ locals { ]) } +terraform { + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.0" + } + } +} + +data "google_client_config" "identity" { + count = var.credentials_config.fleet_host != null ? 1 : 0 +} + + resource "google_project_service" "cloudbuild" { count = var.build_latency_profile_generator_image ? 1 : 0 project = var.project_id diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index fb6e1d9e1..697f57911 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -1,7 +1,7 @@ apiVersion: batch/v1 kind: Job metadata: - name: latency-profile-generator + name: lpg-${combo} namespace: ${namespace} labels: name: latency-profile-generator diff --git a/benchmarks/benchmark/tools/latency-profile/providers.tf b/benchmarks/benchmark/tools/latency-profile/providers.tf deleted file mode 100644 index 70c82e817..000000000 --- a/benchmarks/benchmark/tools/latency-profile/providers.tf +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Copyright 2024 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -data "google_client_config" "identity" { - count = var.credentials_config.fleet_host != null ? 
1 : 0 -} - -provider "kubernetes" { - config_path = ( - var.credentials_config.kubeconfig == null - ? null - : pathexpand(var.credentials_config.kubeconfig.path) - ) - config_context = try( - var.credentials_config.kubeconfig.context, null - ) - host = ( - var.credentials_config.fleet_host == null - ? null - : var.credentials_config.fleet_host - ) - token = try(data.google_client_config.identity.0.access_token, null) -} diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 5bbd2c1e8..4f444c64e 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -76,11 +76,16 @@ variable "inference_server" { type = object({ deploy = optional(bool), # Do you want this module to deploy the model server? name = string, + model = string, tokenizer = string, service = object({ name = string, port = number, }) + accelerator_config = object({ + type = string, + count = number, + }) }) nullable = false diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index 6e668af1d..be2545e83 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -14,6 +14,27 @@ * limitations under the License. */ +provider "kubernetes" { + config_path = ( + var.credentials_config.kubeconfig == null + ? null + : pathexpand(var.credentials_config.kubeconfig.path) + ) + config_context = try( + var.credentials_config.kubeconfig.context, null + ) + host = ( + var.credentials_config.fleet_host == null + ? null + : var.credentials_config.fleet_host + ) + token = try(data.google_client_config.identity.0.access_token, null) +} + +data "google_client_config" "identity" { + count = var.credentials_config.fleet_host != null ? 1 : 0 +} + resource "google_project_service" "cloudbuild" { @@ -32,6 +53,22 @@ resource "google_project_service" "cloudbuild" { # CREATE NODEPOOLS module "latency-profile" { + for_each = toset( + flatten([ + for config in toset(var.profiles.config): toset([ + for model_server_config in toset(config.model_server_configs): toset([ + for model in toset(model_server_config.models): toset([ + for model_config in toset(model_server_config.model_configs): toset([ + for accelerator in toset(model_config.accelerators): toset([ + for accelerator_config in toset(model_config.accelerator_configs): + join(" ", [model, config.model_server, accelerator, accelerator_config.accelerator_count]) + ]) + ]) + ]) + ]) + ]) + ]) + ) source = "../latency-profile" credentials_config = var.credentials_config @@ -41,7 +78,20 @@ module "latency-profile" { templates_path = var.templates_path artifact_registry = var.artifact_registry build_latency_profile_generator_image = false # Dont build image for each profile generator instance, only need to do once. 
- inference_server = var.inference_server + inference_server = { + deploy = true + name = split(" ", each.value)[1] + model = split(" ", each.value)[0] + tokenizer = "google/gemma-7b" + service = { + name = "maxengine-server", # inference server service name + port = 8000 + } + accelerator_config = { + type = split(" ", each.value)[2] + count = split(" ", each.value)[3] + } +} max_num_prompts = var.max_num_prompts max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index b28cf8807..04890b277 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -22,7 +22,6 @@ credentials_config = { project_id = "tpu-vm-gke-testing" - # Latency profile generator service configuration artifact_registry = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark" build_latency_profile_generator_image = false @@ -30,17 +29,6 @@ latency_profile_kubernetes_service_account = "prom-frontend-sa" output_bucket = "tpu-vm-gke-testing-benchmark-output-bucket" k8s_hf_secret = "hf-token" -# Inference server configuration -inference_server = { - deploy = false - name = "jetstream" - tokenizer = "google/gemma-7b" - service = { - name = "maxengine-server", # inference server service name - port = 8000 - } -} - # Benchmark configuration for Locust Docker accessing inference server request_rates = [5, 10, 15, 20] @@ -64,14 +52,21 @@ profiles = { request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] config = [{ - model_server = "Jetstream" + model_server = "jetstream" model_server_configs = [{ models = [ "gemma2-2b", "gemma2-9b", "gemma2-27b" ] - model_configs = [] + model_configs = [{ + accelerators = [ + "tpu-v5-lite-podslice", + ] + accelerator_configs = [{ + accelerator_count = 1 + }] + }] }] }, { model_server = "vllm" diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index 7a4dcea7f..4ed2d3cf6 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -71,24 +71,6 @@ variable "build_latency_profile_generator_image" { default = true } -variable "inference_server" { - type = object({ - deploy = optional(bool), # Do you want this module to deploy the model server? - name = string, - tokenizer = string, # Benchmark server configuration for tokenizer - service = object({ - name = string, - port = number, - }) - }) - nullable = false - - validation { - condition = var.inference_server.name == "vllm" || var.inference_server.name == "tgi" || var.inference_server.name == "tensorrt_llm_triton" || var.inference_server.name == "sax" || var.inference_server.name == "jetstream" - error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream." - } -} - variable "max_num_prompts" { description = "Benchmark server configuration for max number of prompts." 
type = number From b720fb334dea4aa5d3be4d7d357c9d846a5211a6 Mon Sep 17 00:00:00 2001 From: Ashok Chandrasekar Date: Wed, 21 Aug 2024 04:24:56 +0000 Subject: [PATCH 19/37] Update table of contents --- benchmarks/benchmark/tools/latency-profile/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/README.md b/benchmarks/benchmark/tools/latency-profile/README.md index a329894cd..7145cd23b 100644 --- a/benchmarks/benchmark/tools/latency-profile/README.md +++ b/benchmarks/benchmark/tools/latency-profile/README.md @@ -6,12 +6,12 @@ * [Instructions](#instructions) * [Step 1: create output bucket](#step-1--create-output-bucket) * [Step 2: create and give service account access to write to output gcs bucket](#step-2--create-and-give-service-account-access-to-write-to-output-gcs-bucket) - * [Step 5: create artifact repository for automated Latency Profile Generator docker build](#step-5--create-artifact-repository-for-automated-latency-profile-generator-docker-build) - * [Step 6: create and configure terraform.tfvars](#step-6--create-and-configure-terraformtfvars) + * [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3--create-artifact-repository-for-automated-latency-profile-generator-docker-build) + * [Step 4: create and configure terraform.tfvars](#step-4--create-and-configure-terraformtfvars) * [[optional] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig) * [[optional] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager) - * [Step 7: login to gcloud](#step-7--login-to-gcloud) - * [Step 8: terraform initialize, plan and apply](#step-8--terraform-initialize-plan-and-apply) + * [Step 5: login to gcloud](#step-5--login-to-gcloud) + * [Step 6: terraform initialize, plan and apply](#step-6--terraform-initialize-plan-and-apply) * [Inputs](#inputs) From 49f099bcf311c299c246452a0cd5afa0b698d7e3 Mon Sep 17 00:00:00 2001 From: Ashok Chandrasekar Date: Wed, 21 Aug 2024 04:35:15 +0000 Subject: [PATCH 20/37] Fix lint issues --- benchmarks/benchmark/tools/latency-profile/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 4f444c64e..43f2caf18 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -83,7 +83,7 @@ variable "inference_server" { port = number, }) accelerator_config = object({ - type = string, + type = string, count = number, }) }) From d5062bc10b3da89b3b684632e4f25261eeaeae6e Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 21 Aug 2024 17:23:54 +0000 Subject: [PATCH 21/37] remove specific project id --- benchmarks/benchmark/tools/profile-generator/sample.tfvars | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index 04890b277..2964eda5f 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -23,7 +23,6 @@ credentials_config = { project_id = "tpu-vm-gke-testing" # Latency profile generator service configuration -artifact_registry = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark" build_latency_profile_generator_image = false 
latency_profile_kubernetes_service_account = "prom-frontend-sa" output_bucket = "tpu-vm-gke-testing-benchmark-output-bucket" From 6b329bf77404c7a26e8997dc84eae1b077b2777f Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Wed, 21 Aug 2024 20:16:47 +0000 Subject: [PATCH 22/37] remove artifact_registry --- benchmarks/benchmark/tools/latency-profile/sample.tfvars | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/benchmark/tools/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/latency-profile/sample.tfvars index e5d45f3e3..5f50bc692 100644 --- a/benchmarks/benchmark/tools/latency-profile/sample.tfvars +++ b/benchmarks/benchmark/tools/latency-profile/sample.tfvars @@ -26,7 +26,6 @@ ksa = "benchmark-ksa" k8s_hf_secret = "hf-token" # Latency profile generator service configuration -artifact_registry = "us-central1-docker.pkg.dev/$PROJECT_ID/ai-benchmark" latency_profile_kubernetes_service_account = "sample-runner-ksa" output_bucket = "${PROJECT_ID}-benchmark-output" gcs_path = "gs://${PROJECT_ID}-ai-gke-benchmark-fuse/ShareGPT_V3_unfiltered_cleaned_split_filtered_prompts.txt" From 051b310a69b9649be65c955b6c5bf5fd57afa8ac Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 22 Aug 2024 14:20:56 -0700 Subject: [PATCH 23/37] Stripped back LPG automation for separate PR --- .../benchmark/tools/latency-profile/main.tf | 24 +----- .../latency-profile-generator.yaml.tpl | 6 +- .../README.md | 0 .../benchmark/tools/profile-generator/main.tf | 35 ++------ .../tools/profile-generator/sample.tfvars | 82 ++----------------- .../tools/profile-generator/variables.tf | 51 ++---------- 6 files changed, 25 insertions(+), 173 deletions(-) rename benchmarks/benchmark/tools/{latency-profile => profile-generator}/README.md (100%) diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 7460e1229..6b505ce6d 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -28,10 +28,11 @@ locals { ? null : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}" ) + id = substr(uuid(), 0, 8) all_manifests = flatten([for manifest_file in local.templates : [for data in split("---", templatefile(manifest_file, { - combo = format("%s-%s-%s-%s", var.inference_server.name, var.inference_server.model, var.inference_server.accelerator_config.type, var.inference_server.accelerator_config.count) + id = local.id artifact_registry = var.artifact_registry namespace = var.namespace inference_server_framework = var.inference_server.name @@ -64,7 +65,6 @@ data "google_client_config" "identity" { count = var.credentials_config.fleet_host != null ? 1 : 0 } - resource "google_project_service" "cloudbuild" { count = var.build_latency_profile_generator_image ? 1 : 0 project = var.project_id @@ -78,16 +78,6 @@ resource "google_project_service" "cloudbuild" { disable_on_destroy = false } -resource "null_resource" "deploy_model_server" { - count = var.inference_server.deploy ? 
1 : 0 - provisioner "local-exec" { - command = "echo hello" - } - triggers = { - always_run = "${timestamp()}" - } -} - resource "kubernetes_manifest" "deploy_latency_profile_generator" { for_each = toset(local.all_manifests) depends_on = [resource.null_resource.build_and_push_image, resource.null_resource.deploy_model_server] @@ -95,14 +85,4 @@ resource "kubernetes_manifest" "deploy_latency_profile_generator" { timeouts { create = "30m" } -} - -resource "null_resource" "cleanup_model_server" { - depends_on = [resource.kubernetes_manifest.deploy_latency_profile_generator] - provisioner "local-exec" { - command = "kubectl wait --for=condition=complete job/latency-profile-generator --timeout=-9600s && echo hello" - } - triggers = { - always_run = "${timestamp()}" - } } \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 697f57911..2bf2fa39d 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -1,10 +1,10 @@ apiVersion: batch/v1 kind: Job metadata: - name: lpg-${combo} + name: latency-profile-generator-${id} namespace: ${namespace} labels: - name: latency-profile-generator + name: latency-profile-generator-${id} spec: template: spec: @@ -49,5 +49,5 @@ spec: key: HF_TOKEN %{ endfor ~} nodeSelector: - cloud.google.com/gke-accelerator: nvidia-l4 # nvidia-h100-80gb, nvidia-l4 + cloud.google.com/gke-accelerator: nvidia-l4 iam.gke.io/gke-metadata-server-enabled: "true" \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/README.md b/benchmarks/benchmark/tools/profile-generator/README.md similarity index 100% rename from benchmarks/benchmark/tools/latency-profile/README.md rename to benchmarks/benchmark/tools/profile-generator/README.md diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index be2545e83..b4a1aefba 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -35,8 +35,6 @@ data "google_client_config" "identity" { count = var.credentials_config.fleet_host != null ? 1 : 0 } - - resource "google_project_service" "cloudbuild" { count = var.build_latency_profile_generator_image ? 1 : 0 project = var.project_id @@ -50,25 +48,10 @@ resource "google_project_service" "cloudbuild" { disable_on_destroy = false } -# CREATE NODEPOOLS +# ----- Manual Benchmarking ----- module "latency-profile" { - for_each = toset( - flatten([ - for config in toset(var.profiles.config): toset([ - for model_server_config in toset(config.model_server_configs): toset([ - for model in toset(model_server_config.models): toset([ - for model_config in toset(model_server_config.model_configs): toset([ - for accelerator in toset(model_config.accelerators): toset([ - for accelerator_config in toset(model_config.accelerator_configs): - join(" ", [model, config.model_server, accelerator, accelerator_config.accelerator_count]) - ]) - ]) - ]) - ]) - ]) - ]) - ) + count = var.targets.manual != null ? 
1 : 0 source = "../latency-profile" credentials_config = var.credentials_config @@ -79,17 +62,11 @@ module "latency-profile" { artifact_registry = var.artifact_registry build_latency_profile_generator_image = false # Dont build image for each profile generator instance, only need to do once. inference_server = { - deploy = true - name = split(" ", each.value)[1] - model = split(" ", each.value)[0] - tokenizer = "google/gemma-7b" + name = var.targets.manual.name + tokenizer = var.targets.manual.tokenizer service = { - name = "maxengine-server", # inference server service name - port = 8000 - } - accelerator_config = { - type = split(" ", each.value)[2] - count = split(" ", each.value)[3] + name = var.manual.targets.service_name + port = var.manual.targets.service_port } } max_num_prompts = var.max_num_prompts diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index 2964eda5f..7872ff04b 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -31,77 +31,11 @@ k8s_hf_secret = "hf-token" # Benchmark configuration for Locust Docker accessing inference server request_rates = [5, 10, 15, 20] -profiles = { - valid_models = [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b", - "llama3-8b", - "llama3-70b", - "llama3-405b" - ] - valid_accelerators = [ - "tpu-v4-podslice", - "tpu-v5-lite-podslice", - "tpu-v5p-slice", - "nvidia-a100-80gb", - "nvidia-h100-80gb", - "nvidia-l4" - ] - request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] - - config = [{ - model_server = "jetstream" - model_server_configs = [{ - models = [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b" - ] - model_configs = [{ - accelerators = [ - "tpu-v5-lite-podslice", - ] - accelerator_configs = [{ - accelerator_count = 1 - }] - }] - }] - }, { - model_server = "vllm" - model_server_configs = [{ - models = [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b", - "llama3-8b", - "llama3-70b", - "llama3-405b" - ] - model_configs = [] - }] - }, { - model_server = "tgi" - model_server_configs = [{ - models = [ - "gemma2-2b", - "gemma2-9b", - "gemma2-27b", - "llama3-8b", - "llama3-70b", - "llama3-405b" - ] - model_configs = [] - }] - }, { - model_server = "tensorrt-llm" - model_server_configs = [{ - models = [ - "llama3-8b", - "llama3-70b", - "llama3-405b" - ] - model_configs = [] - }] - }] -} \ No newline at end of file +targets = { + manual = { + name = 'your-model-server-name' + service_name = 'your-model-server-service-name' + service_port = 'your-model-service-service-port' + tokenizer = 'your-tokenizer' + } +} diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index 4ed2d3cf6..bc67f1e2c 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -141,53 +141,14 @@ variable "hugging_face_secret_version" { default = null } -variable "profiles" { +variable "targets" { description = "Model servers to benchmark" type = object({ - valid_models = list(string) - valid_accelerators = list(string) - request_rates = list(number) - - config = list(object({ - model_server = string # Model server name - model_server_configs = list(object({ - models = list(string) # model name - model_configs = list(object({ - accelerators = list(string) # Accelerator name - accelerator_configs = list(object({ - accelerator_count = number # Number of 
accelerators - })) - })) - })) + manual = optional(object({ + name = string + service_name = string + service_port = number + tokenizer = string })) }) - - validation { - condition = alltrue([ - for cfg in var.profiles.config : alltrue([ - for model_server_config in cfg.model_server_configs : ( - alltrue([ - for model_config in model_server_config.model_configs : - alltrue([for accelerator in model_config.accelerators : - contains(var.profiles.valid_accelerators, accelerator)]) - ]) - ) - ]) - ]) - error_message = "Each accelerator must be in the valid_accelerators list." - } - - validation { - condition = alltrue([ - for cfg in var.profiles.config : alltrue([ - for model_server_config in cfg.model_server_configs : ( - alltrue([ - for model in model_server_config.models : - contains(var.profiles.valid_models, model) - ]) - ) - ]) - ]) - error_message = "Each model must be in the valid_models list." - } } \ No newline at end of file From 3353eab78f16446fa174db82e1c82d3552443c49 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 22 Aug 2024 14:28:18 -0700 Subject: [PATCH 24/37] typo --- benchmarks/benchmark/tools/profile-generator/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index bc67f1e2c..c9b231277 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -142,7 +142,7 @@ variable "hugging_face_secret_version" { } variable "targets" { - description = "Model servers to benchmark" + description = "Model server to benchmark" type = object({ manual = optional(object({ name = string From 6f7e36444ff762614ef7fbe27431ae17ad8cf8d7 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 22 Aug 2024 14:35:25 -0700 Subject: [PATCH 25/37] nit --- benchmarks/benchmark/tools/profile-generator/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index b4a1aefba..b04e04ce7 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -50,7 +50,7 @@ resource "google_project_service" "cloudbuild" { # ----- Manual Benchmarking ----- -module "latency-profile" { +module "profile-generator" { count = var.targets.manual != null ? 
1 : 0 source = "../latency-profile" @@ -65,8 +65,8 @@ module "latency-profile" { name = var.targets.manual.name tokenizer = var.targets.manual.tokenizer service = { - name = var.manual.targets.service_name - port = var.manual.targets.service_port + name = var.targets.manual.targets.service_name + port = var.targets.manual.targets.service_port } } max_num_prompts = var.max_num_prompts From f73d4a6d4923015ff7db29e02f965ff33d0aaddb Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 22 Aug 2024 23:05:12 +0000 Subject: [PATCH 26/37] remove bad depends_on --- benchmarks/benchmark/tools/latency-profile/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 6b505ce6d..94535189f 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -80,7 +80,6 @@ resource "google_project_service" "cloudbuild" { resource "kubernetes_manifest" "deploy_latency_profile_generator" { for_each = toset(local.all_manifests) - depends_on = [resource.null_resource.build_and_push_image, resource.null_resource.deploy_model_server] manifest = yamldecode(each.value) timeouts { create = "30m" From e17f4c1682c02dc4dfd8d1821399a6ddb097e505 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 22 Aug 2024 23:10:56 +0000 Subject: [PATCH 27/37] nits --- benchmarks/benchmark/tools/profile-generator/main.tf | 4 ++-- benchmarks/benchmark/tools/profile-generator/variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index b04e04ce7..809ad6884 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -65,8 +65,8 @@ module "profile-generator" { name = var.targets.manual.name tokenizer = var.targets.manual.tokenizer service = { - name = var.targets.manual.targets.service_name - port = var.targets.manual.targets.service_port + name = var.targets.manual.service_name + port = var.targets.manual.service_port } } max_num_prompts = var.max_num_prompts diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index c9b231277..94f448f80 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -144,11 +144,11 @@ variable "hugging_face_secret_version" { variable "targets" { description = "Model server to benchmark" type = object({ - manual = optional(object({ + manual = object({ name = string service_name = string service_port = number tokenizer = string - })) + }) }) } \ No newline at end of file From 6a59803c4ad7a1e5a51d5432a294161b65cd6d3d Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Thu, 22 Aug 2024 23:12:31 +0000 Subject: [PATCH 28/37] nits --- benchmarks/benchmark/tools/latency-profile/variables.tf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 43f2caf18..5bbd2c1e8 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -76,16 +76,11 @@ variable "inference_server" { type = object({ deploy = optional(bool), # Do you want this module to deploy the model server? 
name = string, - model = string, tokenizer = string, service = object({ name = string, port = number, }) - accelerator_config = object({ - type = string, - count = number, - }) }) nullable = false From 99907c6db4997b4349d4bac99dac3e7e5f2229b3 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 22:37:42 +0800 Subject: [PATCH 29/37] Update main.tf --- benchmarks/benchmark/tools/locust-load-inference/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/main.tf b/benchmarks/benchmark/tools/locust-load-inference/main.tf index 6d5fcf65a..ff557f9ee 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/main.tf +++ b/benchmarks/benchmark/tools/locust-load-inference/main.tf @@ -34,7 +34,7 @@ locals { [for data in split("---", templatefile(manifest_file, { artifact_registry = var.artifact_registry namespace = var.namespace - inference_server_service = var.inference_server.service + inference_server_service = var.inference_server_service inference_server_framework = var.inference_server_framework best_of = var.best_of gcs_path = var.gcs_path From b54c7a96102b7b247d1936fbc831d7e9837703f1 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 23:01:52 +0800 Subject: [PATCH 30/37] Update README.md --- .../tools/profile-generator/README.md | 48 +++++-------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/README.md b/benchmarks/benchmark/tools/profile-generator/README.md index 7145cd23b..f7df0c738 100644 --- a/benchmarks/benchmark/tools/profile-generator/README.md +++ b/benchmarks/benchmark/tools/profile-generator/README.md @@ -10,7 +10,6 @@ * [Step 4: create and configure terraform.tfvars](#step-4--create-and-configure-terraformtfvars) * [[optional] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig) * [[optional] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager) - * [Step 5: login to gcloud](#step-5--login-to-gcloud) * [Step 6: terraform initialize, plan and apply](#step-6--terraform-initialize-plan-and-apply) * [Inputs](#inputs) @@ -41,8 +40,13 @@ Set the `output_bucket` in your `terraform.tfvars` to this gcs bucket. The Latency profile generator requires storage.admin access to write output to the given output gcs bucket. If you followed steps in `../../infra`, then you -already have a kubernetes and gcloud service account created that has the proper -access to the created output bucket. +should already be logged into gcloud and have a kubernetes and gcloud service +account created that has the proper access to the created output bucket. If you are +not logged into gcloud, run the following: + +```bash +gcloud auth application-default login +``` To give viewer permissions on the gcs bucket to the gcloud service account, run the following: @@ -54,12 +58,13 @@ gcloud storage buckets add-iam-policy-binding gs://$OUTPUT_BUCKET/ Your kubernetes service account will inherit the reader permissions. -You will set the `lantency_profile_kubernetes_service_account` in your +You will set the `latency_profile_kubernetes_service_account` in your `terraform.tfvars` to the kubernetes service account name. ### Step 3: create artifact repository for automated Latency Profile Generator docker build -The latency profile generator rebuilds the docker file on each terraform apply.
+The latency profile generator rebuilds the docker file on each terraform apply +if `build_latency_profile_generator_image` is set to true (default is true). The containers will be pushed to the given `artifact_registry`. This artifact repository is expected to already exist. If you created your cluster via `../../infra/`, then an artifact repository was created for you with the same @@ -70,7 +75,6 @@ own via this command: gcloud artifacts repositories create ai-benchmark --location=us-central1 --repository-format=docker ``` - ### Step 4: create and configure terraform.tfvars Create a `terraform.tfvars` file. `./sample-tfvars` is provided as an example @@ -86,11 +90,11 @@ Fill out your `terraform.tfvars` with the desired model and server configuration - `credentials_config` - credentials for cluster to deploy Latency Profile Generator benchmark tool on - `project_id` - project id for enabling dependent services for building Latency Profile Generator artifacts - `artifact_registry` - artifact registry to upload Latency Profile Generator artifacts to -- `inference_server_service` - an accessible service name for inference workload to be benchmarked **(Note: If you are using a non-80 port for your model server service, it should be specified here. Example: `my-service-name:9000`)** -- `tokenizer` - must match the model running on the inference workload to be benchmarked -- `inference_server_framework` - the inference workload framework +- `build_latency_profile_generator_image` - Whether latency profile generator image will be built or not +- `targets` - Which model servers are we targeting for benchmarking? Set `manual` if intending to benchmark a model server already in the cluster. - `output_bucket` - gcs bucket to write benchmarking metrics to. - `latency_profile_kubernetes_service_account` - service account giving access to latency profile generator to write to `output_bucket` +- `k8s_hf_secret` - Name of secret for huggingface token stored in k8s #### [optional] set-up credentials config with kubeconfig @@ -171,29 +175,3 @@ terraform apply A results file will appear in GCS bucket specified as `output_bucket` in input variables. - - - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [artifact\_registry](#input\_artifact\_registry) | Artifact registry for storing Latency Profile Generator container. | `string` | `null` | no | -| [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. |
object({
fleet_host = optional(string)
kubeconfig = optional(object({
context = optional(string)
path = optional(string, "~/.kube/config")
}))
})
| n/a | yes | -| [hugging\_face\_secret](#input\_hugging\_face\_secret) | name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/ | `string` | `null` | no | -| [hugging\_face\_secret\_version](#input\_hugging\_face\_secret\_version) | Secret version in Secret Manager | `string` | `null` | no | -| [inference\_server\_framework](#input\_inference\_server\_framework) | Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt\_llm\_triton, sax | `string` | `"tgi"` | no | -| [inference\_server\_service](#input\_inference\_server\_service) | Inference server service | `string` | n/a | yes | -| [k8s\_hf\_secret](#input\_k8s\_hf\_secret) | Name of secret for huggingface token; stored in k8s | `string` | `null` | no | -| [ksa](#input\_ksa) | Kubernetes Service Account used for workload. | `string` | `"default"` | no | -| [latency\_profile\_kubernetes\_service\_account](#input\_latency\_profile\_kubernetes\_service\_account) | Kubernetes Service Account to be used for the latency profile generator tool | `string` | `"sample-runner-ksa"` | no | -| [max\_num\_prompts](#input\_max\_num\_prompts) | Benchmark server configuration for max number of prompts. | `number` | `1000` | no | -| [max\_output\_len](#input\_max\_output\_len) | Benchmark server configuration for max output length. | `number` | `256` | no | -| [max\_prompt\_len](#input\_max\_prompt\_len) | Benchmark server configuration for max prompt length. | `number` | `256` | no | -| [namespace](#input\_namespace) | Namespace used for model and benchmarking deployments. | `string` | `"default"` | no | -| [output\_bucket](#input\_output\_bucket) | Bucket name for storing results | `string` | n/a | yes | -| [project\_id](#input\_project\_id) | Project id of existing or created project. | `string` | n/a | yes | -| [templates\_path](#input\_templates\_path) | Path where manifest templates will be read from. Set to null to use the default manifests | `string` | `null` | no | -| [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. | `string` | `"tiiuae/falcon-7b"` | no | - - From 2618042a36cdd3845d60129bddef1167335b1a03 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 16:08:11 +0000 Subject: [PATCH 31/37] more cleanup --- .../benchmark/tools/latency-profile/main.tf | 46 ++++++++----------- .../latency-profile-generator.yaml.tpl | 4 +- .../tools/latency-profile/sample.tfvars | 1 - .../tools/latency-profile/variables.tf | 7 --- .../benchmark/tools/profile-generator/main.tf | 21 ++++----- .../tools/profile-generator/sample.tfvars | 12 ++--- .../tools/profile-generator/variables.tf | 13 ++---- 7 files changed, 39 insertions(+), 65 deletions(-) diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf index 94535189f..4a68d0cb2 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/latency-profile/main.tf @@ -23,33 +23,12 @@ locals { ? "${path.module}/manifest-templates" : pathexpand(var.templates_path) ) + latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl" hugging_face_token_secret = ( var.hugging_face_secret == null || var.hugging_face_secret_version == null ? 
null : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}" ) - id = substr(uuid(), 0, 8) - - all_manifests = flatten([for manifest_file in local.templates : - [for data in split("---", templatefile(manifest_file, { - id = local.id - artifact_registry = var.artifact_registry - namespace = var.namespace - inference_server_framework = var.inference_server.name - inference_server_service = var.inference_server.service.name - inference_server_service_port = var.inference_server.service.port - tokenizer = var.inference_server.tokenizer - ksa = var.ksa - latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account - max_num_prompts = var.max_num_prompts - max_output_len = var.max_output_len - max_prompt_len = var.max_prompt_len - request_rates = join(",", [for number in var.request_rates : tostring(number)]) - hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] - k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] - output_bucket = var.output_bucket - })) : data] - ]) } terraform { @@ -78,10 +57,21 @@ resource "google_project_service" "cloudbuild" { disable_on_destroy = false } -resource "kubernetes_manifest" "deploy_latency_profile_generator" { - for_each = toset(local.all_manifests) - manifest = yamldecode(each.value) - timeouts { - create = "30m" - } +resource "kubernetes_manifest" "latency-profile-generator" { + manifest = yamldecode(templatefile(local.latency-profile-generator-template, { + namespace = var.namespace + artifact_registry = var.artifact_registry + inference_server_framework = var.inference_server.name + inference_server_service = var.inference_server.service.name + inference_server_service_port = var.inference_server.service.port + tokenizer = var.inference_server.tokenizer + latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account + max_num_prompts = var.max_num_prompts + max_output_len = var.max_output_len + max_prompt_len = var.max_prompt_len + request_rates = join(",", [for number in var.request_rates : tostring(number)]) + hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] + k8s_hf_secret_list = var.k8s_hf_secret == null ? 
[] : [var.k8s_hf_secret] + output_bucket = var.output_bucket + })) } \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 2bf2fa39d..ba75c3ed1 100644 --- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -1,10 +1,10 @@ apiVersion: batch/v1 kind: Job metadata: - name: latency-profile-generator-${id} + name: latency-profile-generator namespace: ${namespace} labels: - name: latency-profile-generator-${id} + name: latency-profile-generator spec: template: spec: diff --git a/benchmarks/benchmark/tools/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/latency-profile/sample.tfvars index 5f50bc692..9451b6755 100644 --- a/benchmarks/benchmark/tools/latency-profile/sample.tfvars +++ b/benchmarks/benchmark/tools/latency-profile/sample.tfvars @@ -21,7 +21,6 @@ credentials_config = { project_id = "$PROJECT_ID" namespace = "benchmark" -ksa = "benchmark-ksa" k8s_hf_secret = "hf-token" diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf index 5bbd2c1e8..1c6969a37 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/latency-profile/variables.tf @@ -46,13 +46,6 @@ variable "project_id" { nullable = false } -variable "ksa" { - description = "Kubernetes Service Account used for workload." - type = string - nullable = false - default = "default" -} - variable "templates_path" { description = "Path where manifest templates will be read from. Set to null to use the default manifests" type = string diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index 809ad6884..ee3f3e19b 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -50,25 +50,24 @@ resource "google_project_service" "cloudbuild" { # ----- Manual Benchmarking ----- -module "profile-generator" { - count = var.targets.manual != null ? 1 : 0 +module "latency-profile" { + count = var.targets.manual != null ? 1 : 0 source = "../latency-profile" - credentials_config = var.credentials_config - namespace = var.namespace - project_id = var.project_id - ksa = var.ksa - templates_path = var.templates_path - artifact_registry = var.artifact_registry - build_latency_profile_generator_image = false # Dont build image for each profile generator instance, only need to do once. - inference_server = { + credentials_config = var.credentials_config + namespace = var.namespace + project_id = var.project_id + templates_path = var.templates_path + artifact_registry = var.artifact_registry + build_latency_profile_generator_image = false # Dont build image for each profile generator instance, only need to do once. 
+ inference_server = { name = var.targets.manual.name tokenizer = var.targets.manual.tokenizer service = { name = var.targets.manual.service_name port = var.targets.manual.service_port } -} + } max_num_prompts = var.max_num_prompts max_output_len = var.max_output_len max_prompt_len = var.max_prompt_len diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index 7872ff04b..68ad613fb 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -20,12 +20,12 @@ credentials_config = { } } -project_id = "tpu-vm-gke-testing" +project_id = "your_project_id" # Latency profile generator service configuration build_latency_profile_generator_image = false latency_profile_kubernetes_service_account = "prom-frontend-sa" -output_bucket = "tpu-vm-gke-testing-benchmark-output-bucket" +output_bucket = "your_project_id-benchmark-output-bucket" k8s_hf_secret = "hf-token" # Benchmark configuration for Locust Docker accessing inference server @@ -33,9 +33,9 @@ request_rates = [5, 10, 15, 20] targets = { manual = { - name = 'your-model-server-name' - service_name = 'your-model-server-service-name' - service_port = 'your-model-service-service-port' - tokenizer = 'your-tokenizer' + name = "your_model_server_name" + service_name = "your_model_server_service_name" + service_port = "your_model_service_service_port" + tokenizer = "your_tokenizer" } } diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index 94f448f80..26dd77d85 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -46,13 +46,6 @@ variable "project_id" { nullable = false } -variable "ksa" { - description = "Kubernetes Service Account used for workload." - type = string - nullable = false - default = "default" -} - variable "templates_path" { description = "Path where manifest templates will be read from. 
Set to null to use the default manifests" type = string @@ -142,13 +135,13 @@ variable "hugging_face_secret_version" { } variable "targets" { - description = "Model server to benchmark" + description = "Model server(s) targeted for benchmarking, use 'manual' for already installed model servers" type = object({ manual = object({ - name = string + name = string service_name = string service_port = number - tokenizer = string + tokenizer = string }) }) } \ No newline at end of file From 7267d3076469aa2ca9d5fbccdea3d7b5157f0190 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 16:17:46 +0000 Subject: [PATCH 32/37] move latency-profile module to subdirectory --- .../benchmark/tools/latency-profile/build.tf | 8 - .../latency-profile/container/Dockerfile | 21 - .../container/benchmark_serving.py | 469 ------------------ .../container/latency_throughput_curve.sh | 28 -- .../container/requirements.txt | 37 -- .../benchmark/tools/profile-generator/main.tf | 13 +- .../modules}/latency-profile/main.tf | 13 - .../latency-profile-generator.yaml.tpl | 0 .../modules}/latency-profile/sample.tfvars | 0 .../modules}/latency-profile/variables.tf | 6 - 10 files changed, 6 insertions(+), 589 deletions(-) delete mode 100644 benchmarks/benchmark/tools/latency-profile/build.tf delete mode 100644 benchmarks/benchmark/tools/latency-profile/container/Dockerfile delete mode 100644 benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py delete mode 100755 benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh delete mode 100644 benchmarks/benchmark/tools/latency-profile/container/requirements.txt rename benchmarks/benchmark/tools/{ => profile-generator/modules}/latency-profile/main.tf (91%) rename benchmarks/benchmark/tools/{ => profile-generator/modules}/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl (100%) rename benchmarks/benchmark/tools/{ => profile-generator/modules}/latency-profile/sample.tfvars (100%) rename benchmarks/benchmark/tools/{ => profile-generator/modules}/latency-profile/variables.tf (96%) diff --git a/benchmarks/benchmark/tools/latency-profile/build.tf b/benchmarks/benchmark/tools/latency-profile/build.tf deleted file mode 100644 index 2f5e17cd3..000000000 --- a/benchmarks/benchmark/tools/latency-profile/build.tf +++ /dev/null @@ -1,8 +0,0 @@ -resource "null_resource" "build_and_push_image" { - count = var.build_latency_profile_generator_image ? 
1 : 0 - depends_on = [resource.google_project_service.cloudbuild] - provisioner "local-exec" { - working_dir = path.module - command = "gcloud builds submit --tag ${var.artifact_registry}/latency-profile:latest container" - } -} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/latency-profile/container/Dockerfile b/benchmarks/benchmark/tools/latency-profile/container/Dockerfile deleted file mode 100644 index a133294a8..000000000 --- a/benchmarks/benchmark/tools/latency-profile/container/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev - -RUN apt-get update -y \ - && apt-get install -y python3-pip git vim curl wget -RUN pip3 install --upgrade pip -RUN pip install packaging torch transformers -WORKDIR /workspace - -# install build and runtime dependencies -COPY requirements.txt requirements.txt -RUN pip install -r requirements.txt - -RUN pip install -U "huggingface_hub[cli]" - -RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -COPY benchmark_serving.py benchmark_serving.py -COPY latency_throughput_curve.sh latency_throughput_curve.sh - -RUN chmod +x latency_throughput_curve.sh -RUN chmod +x benchmark_serving.py diff --git a/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py b/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py deleted file mode 100644 index 842b0af39..000000000 --- a/benchmarks/benchmark/tools/latency-profile/container/benchmark_serving.py +++ /dev/null @@ -1,469 +0,0 @@ -r"""Benchmark LLM serving throughput and latency. - -This script is for sending requests with prompts to LLM server and benchmark -the latency and throughput at various request rates. It is a modified version of -https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py. -It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. -""" - -import argparse -import asyncio -import json -import random -import time -from typing import AsyncGenerator, List, Tuple - -import aiohttp -import numpy as np -from transformers import AutoTokenizer -from transformers import PreTrainedTokenizerBase - - -# (prompt len, output len, latency) -REQUEST_LATENCY: List[Tuple[int, int, float]] = [] - -MIN_SEQ_LEN = 4 -CLIENT_TIMEOUT_SEC = 3 * 60 * 60 -NEW_TEXT_KEY = "\nOutput:\n" - - -def sample_requests( - dataset_path: str, - num_requests: int, - max_input_len: int, - max_output_len: int, - tokenizer: PreTrainedTokenizerBase, - use_dummy_text: bool, -) -> List[Tuple[str, int, int]]: - """Samples requests from the dataset or creates dummy requests.""" - if use_dummy_text: - dummy_prompt_token_ids = [0] * max_input_len - dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) - dummy_requests = [( - dummy_prompt, - max_input_len, - max_output_len, - )] * num_requests - return dummy_requests - - # Load the dataset. - with open(dataset_path) as f: - dataset = json.load(f) - # Filter out the conversations with less than 2 turns. - dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [ - (data["conversations"][0]["value"], data["conversations"][1]["value"]) - for data in dataset - ] - - # Tokenize the prompts and completions. 
- prompts = [prompt for prompt, _ in dataset] - prompt_token_ids = tokenizer(prompts).input_ids - completions = [completion for _, completion in dataset] - completion_token_ids = tokenizer(completions).input_ids - tokenized_dataset = [] - for i in range(len(dataset)): - output_len = len(completion_token_ids[i]) - tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) - - # Filter out too long sequences. - filtered_dataset: List[Tuple[str, int, int]] = [] - for prompt, prompt_token_ids, output_len in tokenized_dataset: - prompt_len = len(prompt_token_ids) - if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: - # Prune too short sequences. - # This is because TGI causes errors when the input or output length - # is too short. - continue - if prompt_len > max_input_len or output_len > max_output_len: - # Prune too long sequences. - continue - filtered_dataset.append((prompt, prompt_len, output_len)) - - # Sample the requests. - sampled_requests = random.sample(filtered_dataset, num_requests) - return sampled_requests - - -async def get_request( - input_requests: List[Tuple[str, int, int]], - request_rate: float, -) -> AsyncGenerator[Tuple[str, int, int], None]: - """Gets request async.""" - input_requests = iter(input_requests) - for request in input_requests: - yield request - - if request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) - # The next request will be sent after the interval. - await asyncio.sleep(interval) - - -async def send_request( - backend: str, - api_url: str, - prompt: str, - prompt_len: int, - output_len: int, - best_of: int, - use_beam_search: bool, - top_k: int, - tokenizer: PreTrainedTokenizerBase, - sax_model: str, -) -> None: - """Sends request to server.""" - request_start_time = time.time() - - headers = {"User-Agent": "Benchmark Client"} - if backend == "vllm": - pload = { - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "max_tokens": output_len, - "ignore_eos": False, - "stream": False, - } - elif backend == "tgi": - assert not use_beam_search - params = { - "best_of": best_of, - "max_new_tokens": output_len, - "do_sample": True, - } - pload = { - "inputs": prompt, - "parameters": params, - } - elif backend == "naive_transformers": - # If max_length or top_k is not specified _MAX_LENGTH_DEFAULT = 200 and - # _TOP_K_DEFAULT = 10 in peft/handler.py will be used. - pload = { - "instances": [{ - "prompt": prompt, - "max_length": output_len, - "top_k": top_k, - }] - } - elif backend == "tensorrt_llm_triton": - pload = { - "text_input": prompt, - "max_tokens": output_len, - "beam_width": 1 if not use_beam_search else best_of, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "bad_words": "", - "stop_words": "", - "stream": False, - } - elif backend == "sax": - pload = { - "model": sax_model, - "prompt": prompt, - "n": 1, - "best_of": best_of, - "use_beam_search": use_beam_search, - "temperature": 0.0 if use_beam_search else 1.0, - "top_p": 1.0, - "top_k": 50, - "max_tokens": output_len, - "stream": False, - } - elif backend == "jetstream": - pload = { - "prompt": prompt, - "max_tokens": 1, - } - else: - raise ValueError(f"Unknown backend: {backend}") - - # Set client timeout to be 3 hrs. 
- timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) - async with aiohttp.ClientSession(timeout=timeout) as session: - while True: - async with session.post(api_url, headers=headers, json=pload) as response: - chunks = [] - async for chunk, _ in response.content.iter_chunks(): - chunks.append(chunk) - output = b"".join(chunks).decode("utf-8") - output = json.loads(output) - - # Re-send the request if it failed. - if "error" not in output: - break - - request_end_time = time.time() - # Naive HF transformers generation and TensorRT-LLM generation stops at EOS - # tokens and the generation may be shorter than the ground-truth output - # sequence length. - if backend == "naive_transformers": - complete_pred = output["predictions"][0][0]["generated_text"] - new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) - pred = complete_pred[new_text_start_index:] - output_token_ids = tokenizer(pred).input_ids - output_len = len(output_token_ids) - prompt_len - elif backend == "tensorrt_llm_triton": - output_token_ids = tokenizer(output["text_output"]).input_ids - output_len = len(output_token_ids) - elif backend == "sax": - output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids - output_len = len(output_token_ids) - elif backend == "tgi": - output_token_ids = tokenizer(output["generated_text"]).input_ids - output_len = len(output_token_ids) - elif backend == "vllm": - total_token_ids = tokenizer(output["text"][0]).input_ids - new_total_len = len(total_token_ids) - output_len = new_total_len - prompt_len - elif backend == "jetstream": - output_token_ids = tokenizer(output["response"]).input_ids - output_len = len(output_token_ids) - - request_latency = request_end_time - request_start_time - REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) - - -async def benchmark( - backend: str, - api_url: str, - input_requests: List[Tuple[str, int, int]], - best_of: int, - use_beam_search: bool, - request_rate: float, - top_k: int, - tokenizer: PreTrainedTokenizerBase, - sax_model: str, -) -> None: - """Runs benchmark with asynchronous requests.""" - tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate): - prompt, prompt_len, output_len = request - task = asyncio.create_task( - send_request( - backend, - api_url, - prompt, - prompt_len, - output_len, - best_of, - use_beam_search, - top_k, - tokenizer, - sax_model, - ) - ) - tasks.append(task) - await asyncio.gather(*tasks) - - -def main(args: argparse.Namespace): - print(args) - random.seed(args.seed) - np.random.seed(args.seed) - - api_url = f"http://{args.host}:{args.port}/{args.endpoint}" - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, trust_remote_code=args.trust_remote_code - ) - input_requests = sample_requests( - args.dataset, - args.num_prompts, - args.max_input_length, - args.max_output_length, - tokenizer, - args.use_dummy_text, - ) - - benchmark_start_time = time.time() - asyncio.run( - benchmark( - args.backend, - api_url, - input_requests, - args.best_of, - args.use_beam_search, - args.request_rate, - args.top_k, - tokenizer, - args.sax_model, - ) - ) - benchmark_end_time = time.time() - benchmark_time = benchmark_end_time - benchmark_start_time - print(f"Total time: {benchmark_time:.2f} s") - print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}") - - total_output_tokens = np.sum([output_len for _, output_len, _ in - REQUEST_LATENCY]) - output_tokens_per_min = 60 * total_output_tokens / benchmark_time - 
print(f"Output_tokens/min: {output_tokens_per_min:.2f}") - - total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in - REQUEST_LATENCY]) - input_tokens_per_min = 60 * total_input_tokens / benchmark_time - print(f"Input_tokens/min: {input_tokens_per_min:.2f}") - - total_tokens = total_input_tokens + total_output_tokens - tokens_per_min = 60 * total_tokens / benchmark_time - print(f"Tokens/min: {tokens_per_min:.2f}") - - if args.machine_cost: - print( - "Cost $/1k tokens:" - f" {args.machine_cost * 1000 / (60 * output_tokens_per_min)}" - ) - # NOTE: The latency below includes requests awaiting time on server side. - # It's not comparable with the model inference latency for batch size 1. - avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY]) - print( - "Average seconds/request (includes waiting time on server):" - f" {avg_latency:.2f}" - ) - - avg_per_token_latency = np.mean([ - latency / (prompt_len + output_len) - for prompt_len, output_len, latency in REQUEST_LATENCY - ]) - print( - "Average milliseconds/token (includes waiting time on server):" - f" {1000 * avg_per_token_latency:.2f}" - ) - - avg_per_output_token_latency = np.mean( - [latency / output_len for _, output_len, latency in REQUEST_LATENCY] - ) - print( - "Average milliseconds/output_token (includes waiting time on server):" - f" {1000 * avg_per_output_token_latency:.2f}" - ) - - avg_input_len = np.mean( - [prompt_len for prompt_len, _, _ in REQUEST_LATENCY] - ) - print( - "Average input length:" - f" {avg_input_len:.2f}" - ) - - avg_output_len = np.mean( - [output_len for _, output_len, _ in REQUEST_LATENCY] - ) - print( - "Average output length:" - f" {avg_output_len:.2f}" - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Benchmark the online serving throughput." - ) - parser.add_argument( - "--backend", - type=str, - default="vllm", - choices=[ - "vllm", - "tgi", - "naive_transformers", - "tensorrt_llm_triton", - "sax", - "jetstream" - ], - ) - parser.add_argument( - "--sax_model", - type=str, - default="", - help="Model name to send request to at API server for SAX model server.", - ) - parser.add_argument("--endpoint", type=str, default="generate") - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=7080) - parser.add_argument("--dataset", type=str, help="Path to the dataset.") - parser.add_argument( - "--tokenizer", - type=str, - required=True, - help="Name or path of the tokenizer.", - ) - parser.add_argument( - "--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and returns the best one.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-prompts", - type=int, - default=1000, - help="Number of prompts to process.", - ) - parser.add_argument( - "--max-input-length", - type=int, - default=1024, - help=( - "Maximum number of input tokens for filtering the benchmark dataset." - ), - ) - parser.add_argument( - "--max-output-length", - type=int, - default=1024, - help=( - "Maximum number of input tokens for filtering the benchmark dataset." - ), - ) - parser.add_argument( - "--top-k", - type=int, - default=32000, - help=( - "Number of candidate tokens that are considered at each step of the" - " generation process. 32000 is the vocab_size of Open-LLaMA and" - " LLaMA2 models." - ), - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help=( - "Number of requests per second. 
If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times." - ), - ) - parser.add_argument("--seed", type=int, default=0) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="trust remote code from huggingface", - ) - parser.add_argument( - "--machine-cost", - type=float, - default=None, - help="Machine cost per hour including accelerators (if any)", - ) - parser.add_argument( - "--use-dummy-text", - action="store_true", - help=( - "Whether to use dummy text with length defined by max_input_length" - " and max_output_length." - ), - ) - cmd_args = parser.parse_args() - main(cmd_args) diff --git a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh deleted file mode 100755 index ec5a838f0..000000000 --- a/benchmarks/benchmark/tools/latency-profile/container/latency_throughput_curve.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -# Copyright 2024 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -o xtrace - -export IP=$IP - -huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential - -for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do - # TODO: Check if profile already exists, if so then skip - timestamp=$(date +"%Y-%m-%d_%H-%M-%S") - output_file="latency-profile-${timestamp}.txt" - python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file -done - diff --git a/benchmarks/benchmark/tools/latency-profile/container/requirements.txt b/benchmarks/benchmark/tools/latency-profile/container/requirements.txt deleted file mode 100644 index 4d1d37e18..000000000 --- a/benchmarks/benchmark/tools/latency-profile/container/requirements.txt +++ /dev/null @@ -1,37 +0,0 @@ -# formatting -yapf==0.32.0 -toml==0.10.2 -ruff==0.1.5 - -# type checking -mypy==0.991 -types-PyYAML -types-requests -types-setuptools - -# testing -pytest -pytest-forked -pytest-asyncio -httpx -einops # required for MPT -flash_attn # required for HuggingFace's llama implementation -openai -requests - -# run -ninja # For faster builds. -psutil -ray >= 2.9 -sentencepiece # Required for LLaMA tokenizer. -numpy -torch == 2.1.1 -transformers >= 4.37.0 # Required for Qwen2 -xformers == 0.0.23 -fastapi -uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. 
-aioprometheus[starlette] -pynvml == 11.5.0 -accelerate -aiohttp diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index ee3f3e19b..292b6122e 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -52,14 +52,13 @@ resource "google_project_service" "cloudbuild" { module "latency-profile" { count = var.targets.manual != null ? 1 : 0 - source = "../latency-profile" + source = "./modules/latency-profile" - credentials_config = var.credentials_config - namespace = var.namespace - project_id = var.project_id - templates_path = var.templates_path - artifact_registry = var.artifact_registry - build_latency_profile_generator_image = false # Dont build image for each profile generator instance, only need to do once. + credentials_config = var.credentials_config + namespace = var.namespace + project_id = var.project_id + templates_path = var.templates_path + artifact_registry = var.artifact_registry inference_server = { name = var.targets.manual.name tokenizer = var.targets.manual.tokenizer diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf similarity index 91% rename from benchmarks/benchmark/tools/latency-profile/main.tf rename to benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf index 4a68d0cb2..3d8a410d9 100644 --- a/benchmarks/benchmark/tools/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf @@ -44,19 +44,6 @@ data "google_client_config" "identity" { count = var.credentials_config.fleet_host != null ? 1 : 0 } -resource "google_project_service" "cloudbuild" { - count = var.build_latency_profile_generator_image ? 
1 : 0 - project = var.project_id - service = "cloudbuild.googleapis.com" - - timeouts { - create = "30m" - update = "40m" - } - - disable_on_destroy = false -} - resource "kubernetes_manifest" "latency-profile-generator" { manifest = yamldecode(templatefile(local.latency-profile-generator-template, { namespace = var.namespace diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl similarity index 100% rename from benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl rename to benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl diff --git a/benchmarks/benchmark/tools/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/sample.tfvars similarity index 100% rename from benchmarks/benchmark/tools/latency-profile/sample.tfvars rename to benchmarks/benchmark/tools/profile-generator/modules/latency-profile/sample.tfvars diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf similarity index 96% rename from benchmarks/benchmark/tools/latency-profile/variables.tf rename to benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf index 1c6969a37..a5dec1259 100644 --- a/benchmarks/benchmark/tools/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf @@ -58,12 +58,6 @@ variable "artifact_registry" { default = null } -variable "build_latency_profile_generator_image" { - description = "Whether latency profile generator image will be built or not" - type = bool - default = true -} - # Inference server configuration variable "inference_server" { type = object({ From 569ef6d90965888e73305235c8b7b93782973c94 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 16:19:12 +0000 Subject: [PATCH 33/37] fmt --- .../modules/latency-profile/main.tf | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf index 3d8a410d9..781fd772e 100644 --- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf @@ -46,19 +46,19 @@ data "google_client_config" "identity" { resource "kubernetes_manifest" "latency-profile-generator" { manifest = yamldecode(templatefile(local.latency-profile-generator-template, { - namespace = var.namespace - artifact_registry = var.artifact_registry - inference_server_framework = var.inference_server.name - inference_server_service = var.inference_server.service.name - inference_server_service_port = var.inference_server.service.port - tokenizer = var.inference_server.tokenizer - latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account - max_num_prompts = var.max_num_prompts - max_output_len = var.max_output_len - max_prompt_len = var.max_prompt_len - request_rates = join(",", [for number in var.request_rates : tostring(number)]) - hugging_face_token_secret_list = local.hugging_face_token_secret == null ? 
[] : [local.hugging_face_token_secret] - k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] - output_bucket = var.output_bucket + namespace = var.namespace + artifact_registry = var.artifact_registry + inference_server_framework = var.inference_server.name + inference_server_service = var.inference_server.service.name + inference_server_service_port = var.inference_server.service.port + tokenizer = var.inference_server.tokenizer + latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account + max_num_prompts = var.max_num_prompts + max_output_len = var.max_output_len + max_prompt_len = var.max_prompt_len + request_rates = join(",", [for number in var.request_rates : tostring(number)]) + hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] + k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] + output_bucket = var.output_bucket })) } \ No newline at end of file From 93682d32b6a848166cfaf64d82d416b8a452ae39 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 16:24:48 +0000 Subject: [PATCH 34/37] supports jetstream --- benchmarks/benchmark/tools/profile-generator/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/profile-generator/README.md b/benchmarks/benchmark/tools/profile-generator/README.md index f7df0c738..cbae57c97 100644 --- a/benchmarks/benchmark/tools/profile-generator/README.md +++ b/benchmarks/benchmark/tools/profile-generator/README.md @@ -24,6 +24,7 @@ It currently supports the following frameworks: - text generation inference (tgi) - vllm - sax +-jetstream ## Instructions @@ -91,7 +92,7 @@ Fill out your `terraform.tfvars` with the desired model and server configuration - `project_id` - project id for enabling dependent services for building Latency Profile Generator artifacts - `artifact_registry` - artifact registry to upload Latency Profile Generator artifacts to - `build_latency_profile_generator_image` - Whether latency profile generator image will be built or not -- `targets` - Which model servers are we targeting for benchmarking? Set `manual` if intending to benchmark a model server already in the cluster. +- `targets` - Which model servers are we targeting for benchmarking? Set the fields on `manual` if intending to benchmark a model server already in the cluster. - `output_bucket` - gcs bucket to write benchmarking metrics to. 
- `latency_profile_kubernetes_service_account` - service account giving access to latency profile generator to write to `output_bucket` - `k8s_hf_secret` - Name of secret for huggingface token stored in k8s From b1a6b7b9e6846727fa0a56a33547d77a1bc75249 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 16:26:31 +0000 Subject: [PATCH 35/37] Added comment --- benchmarks/benchmark/tools/profile-generator/sample.tfvars | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars index 68ad613fb..dea00ad56 100644 --- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -31,6 +31,7 @@ k8s_hf_secret = "hf-token" # Benchmark configuration for Locust Docker accessing inference server request_rates = [5, 10, 15, 20] +# Model server configuration information targets = { manual = { name = "your_model_server_name" From 2ab541f4752b9e29120f9db00ed5fd31c03f4ef8 Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 16:27:55 +0000 Subject: [PATCH 36/37] more accurate comment --- benchmarks/benchmark/tools/profile-generator/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark/tools/profile-generator/README.md b/benchmarks/benchmark/tools/profile-generator/README.md index cbae57c97..6d136bab9 100644 --- a/benchmarks/benchmark/tools/profile-generator/README.md +++ b/benchmarks/benchmark/tools/profile-generator/README.md @@ -174,5 +174,7 @@ terraform plan terraform apply ``` -A results file will appear in GCS bucket specified as `output_bucket` in input -variables. +The results can be viewed via running the following: +``` +kubectl logs job/latency-profile-generator +``` From f1d56d6d8e3cd5df677f12c06658a30d6f02431e Mon Sep 17 00:00:00 2001 From: Brendan Slabe Date: Fri, 23 Aug 2024 17:21:26 +0000 Subject: [PATCH 37/37] readd container folder --- .../profile-generator/container/Dockerfile | 21 + .../container/benchmark_serving.py | 469 ++++++++++++++++++ .../container/latency_throughput_curve.sh | 27 + .../container/requirements.txt | 37 ++ .../modules/latency-profile/main.tf | 1 + 5 files changed, 555 insertions(+) create mode 100644 benchmarks/benchmark/tools/profile-generator/container/Dockerfile create mode 100644 benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py create mode 100644 benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh create mode 100644 benchmarks/benchmark/tools/profile-generator/container/requirements.txt diff --git a/benchmarks/benchmark/tools/profile-generator/container/Dockerfile b/benchmarks/benchmark/tools/profile-generator/container/Dockerfile new file mode 100644 index 000000000..6d49f511e --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/Dockerfile @@ -0,0 +1,21 @@ +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev + +RUN apt-get update -y \ + && apt-get install -y python3-pip git vim curl wget +RUN pip3 install --upgrade pip +RUN pip install packaging torch transformers +WORKDIR /workspace + +# install build and runtime dependencies +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN pip install -U "huggingface_hub[cli]" + +RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +COPY benchmark_serving.py benchmark_serving.py 
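+# latency_throughput_curve.sh drives benchmark_serving.py once per request rate listed in $REQUEST_RATES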
+COPY latency_throughput_curve.sh latency_throughput_curve.sh + +RUN chmod +x latency_throughput_curve.sh +RUN chmod +x benchmark_serving.py \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py new file mode 100644 index 000000000..a05226aa6 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -0,0 +1,469 @@ +r"""Benchmark LLM serving throughput and latency. +This script is for sending requests with prompts to LLM server and benchmark +the latency and throughput at various request rates. It is a modified version of +https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py. +It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. +""" + +import argparse +import asyncio +import json +import random +import time +from typing import AsyncGenerator, List, Tuple + +import aiohttp +import numpy as np +from transformers import AutoTokenizer +from transformers import PreTrainedTokenizerBase + + +# (prompt len, output len, latency) +REQUEST_LATENCY: List[Tuple[int, int, float]] = [] + +MIN_SEQ_LEN = 4 +CLIENT_TIMEOUT_SEC = 3 * 60 * 60 +NEW_TEXT_KEY = "\nOutput:\n" + + +def sample_requests( + dataset_path: str, + num_requests: int, + max_input_len: int, + max_output_len: int, + tokenizer: PreTrainedTokenizerBase, + use_dummy_text: bool, +) -> List[Tuple[str, int, int]]: + """Samples requests from the dataset or creates dummy requests.""" + if use_dummy_text: + dummy_prompt_token_ids = [0] * max_input_len + dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) + dummy_requests = [( + dummy_prompt, + max_input_len, + max_output_len, + )] * num_requests + return dummy_requests + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] + + # Tokenize the prompts and completions. + prompts = [prompt for prompt, _ in dataset] + prompt_token_ids = tokenizer(prompts).input_ids + completions = [completion for _, completion in dataset] + completion_token_ids = tokenizer(completions).input_ids + tokenized_dataset = [] + for i in range(len(dataset)): + output_len = len(completion_token_ids[i]) + tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + + # Filter out too long sequences. + filtered_dataset: List[Tuple[str, int, int]] = [] + for prompt, prompt_token_ids, output_len in tokenized_dataset: + prompt_len = len(prompt_token_ids) + if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: + # Prune too short sequences. + # This is because TGI causes errors when the input or output length + # is too short. + continue + if prompt_len > max_input_len or output_len > max_output_len: + # Prune too long sequences. + continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + # Sample the requests. 
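+  # random.sample picks num_requests distinct entries without replacement, so the
+  # filtered dataset must contain at least num_requests eligible conversations.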
+ sampled_requests = random.sample(filtered_dataset, num_requests) + return sampled_requests + + +async def get_request( + input_requests: List[Tuple[str, int, int]], + request_rate: float, +) -> AsyncGenerator[Tuple[str, int, int], None]: + """Gets request async.""" + input_requests = iter(input_requests) + for request in input_requests: + yield request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + # Sample the request interval from the exponential distribution. + interval = np.random.exponential(1.0 / request_rate) + # The next request will be sent after the interval. + await asyncio.sleep(interval) + + +async def send_request( + backend: str, + api_url: str, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, +) -> None: + """Sends request to server.""" + request_start_time = time.time() + + headers = {"User-Agent": "Benchmark Client"} + if backend == "vllm": + pload = { + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "max_tokens": output_len, + "ignore_eos": False, + "stream": False, + } + elif backend == "tgi": + assert not use_beam_search + params = { + "best_of": best_of, + "max_new_tokens": output_len, + "do_sample": True, + } + pload = { + "inputs": prompt, + "parameters": params, + } + elif backend == "naive_transformers": + # If max_length or top_k is not specified _MAX_LENGTH_DEFAULT = 200 and + # _TOP_K_DEFAULT = 10 in peft/handler.py will be used. + pload = { + "instances": [{ + "prompt": prompt, + "max_length": output_len, + "top_k": top_k, + }] + } + elif backend == "tensorrt_llm_triton": + pload = { + "text_input": prompt, + "max_tokens": output_len, + "beam_width": 1 if not use_beam_search else best_of, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "bad_words": "", + "stop_words": "", + "stream": False, + } + elif backend == "sax": + pload = { + "model": sax_model, + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "top_k": 50, + "max_tokens": output_len, + "stream": False, + } + elif backend == "jetstream": + pload = { + "prompt": prompt, + "max_tokens": 1, + } + else: + raise ValueError(f"Unknown backend: {backend}") + + # Set client timeout to be 3 hrs. + timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) + async with aiohttp.ClientSession(timeout=timeout) as session: + while True: + async with session.post(api_url, headers=headers, json=pload) as response: + chunks = [] + async for chunk, _ in response.content.iter_chunks(): + chunks.append(chunk) + output = b"".join(chunks).decode("utf-8") + output = json.loads(output) + + # Re-send the request if it failed. + if "error" not in output: + break + + request_end_time = time.time() + # Naive HF transformers generation and TensorRT-LLM generation stops at EOS + # tokens and the generation may be shorter than the ground-truth output + # sequence length. 
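+  # The response text is re-tokenized below so that output_len reflects the
+  # number of tokens the server actually generated for this request.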
+ if backend == "naive_transformers": + complete_pred = output["predictions"][0][0]["generated_text"] + new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) + pred = complete_pred[new_text_start_index:] + output_token_ids = tokenizer(pred).input_ids + output_len = len(output_token_ids) - prompt_len + elif backend == "tensorrt_llm_triton": + output_token_ids = tokenizer(output["text_output"]).input_ids + output_len = len(output_token_ids) + elif backend == "sax": + output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids + output_len = len(output_token_ids) + elif backend == "tgi": + output_token_ids = tokenizer(output["generated_text"]).input_ids + output_len = len(output_token_ids) + elif backend == "vllm": + total_token_ids = tokenizer(output["text"][0]).input_ids + new_total_len = len(total_token_ids) + output_len = new_total_len - prompt_len + elif backend == "jetstream": + output_token_ids = tokenizer(output["response"]).input_ids + output_len = len(output_token_ids) + + request_latency = request_end_time - request_start_time + REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) + + +async def benchmark( + backend: str, + api_url: str, + input_requests: List[Tuple[str, int, int]], + best_of: int, + use_beam_search: bool, + request_rate: float, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, +) -> None: + """Runs benchmark with asynchronous requests.""" + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, request_rate): + prompt, prompt_len, output_len = request + task = asyncio.create_task( + send_request( + backend, + api_url, + prompt, + prompt_len, + output_len, + best_of, + use_beam_search, + top_k, + tokenizer, + sax_model, + ) + ) + tasks.append(task) + await asyncio.gather(*tasks) + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + api_url = f"http://{args.host}:{args.port}/{args.endpoint}" + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + input_requests = sample_requests( + args.dataset, + args.num_prompts, + args.max_input_length, + args.max_output_length, + tokenizer, + args.use_dummy_text, + ) + + benchmark_start_time = time.time() + asyncio.run( + benchmark( + args.backend, + api_url, + input_requests, + args.best_of, + args.use_beam_search, + args.request_rate, + args.top_k, + tokenizer, + args.sax_model, + ) + ) + benchmark_end_time = time.time() + benchmark_time = benchmark_end_time - benchmark_start_time + print(f"Total time: {benchmark_time:.2f} s") + print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}") + + total_output_tokens = np.sum([output_len for _, output_len, _ in + REQUEST_LATENCY]) + output_tokens_per_min = 60 * total_output_tokens / benchmark_time + print(f"Output_tokens/min: {output_tokens_per_min:.2f}") + + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in + REQUEST_LATENCY]) + input_tokens_per_min = 60 * total_input_tokens / benchmark_time + print(f"Input_tokens/min: {input_tokens_per_min:.2f}") + + total_tokens = total_input_tokens + total_output_tokens + tokens_per_min = 60 * total_tokens / benchmark_time + print(f"Tokens/min: {tokens_per_min:.2f}") + + if args.machine_cost: + print( + "Cost $/1k tokens:" + f" {args.machine_cost * 1000 / (60 * output_tokens_per_min)}" + ) + # NOTE: The latency below includes requests awaiting time on server side. 
+ # It's not comparable with the model inference latency for batch size 1. + avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY]) + print( + "Average seconds/request (includes waiting time on server):" + f" {avg_latency:.2f}" + ) + + avg_per_token_latency = np.mean([ + latency / (prompt_len + output_len) + for prompt_len, output_len, latency in REQUEST_LATENCY + ]) + print( + "Average milliseconds/token (includes waiting time on server):" + f" {1000 * avg_per_token_latency:.2f}" + ) + + avg_per_output_token_latency = np.mean( + [latency / output_len for _, output_len, latency in REQUEST_LATENCY] + ) + print( + "Average milliseconds/output_token (includes waiting time on server):" + f" {1000 * avg_per_output_token_latency:.2f}" + ) + + avg_input_len = np.mean( + [prompt_len for prompt_len, _, _ in REQUEST_LATENCY] + ) + print( + "Average input length:" + f" {avg_input_len:.2f}" + ) + + avg_output_len = np.mean( + [output_len for _, output_len, _ in REQUEST_LATENCY] + ) + print( + "Average output length:" + f" {avg_output_len:.2f}" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark the online serving throughput." + ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=[ + "vllm", + "tgi", + "naive_transformers", + "tensorrt_llm_triton", + "sax", + "jetstream" + ], + ) + parser.add_argument( + "--sax_model", + type=str, + default="", + help="Model name to send request to at API server for SAX model server.", + ) + parser.add_argument("--endpoint", type=str, default="generate") + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=7080) + parser.add_argument("--dataset", type=str, help="Path to the dataset.") + parser.add_argument( + "--tokenizer", + type=str, + required=True, + help="Name or path of the tokenizer.", + ) + parser.add_argument( + "--best-of", + type=int, + default=1, + help="Generates `best_of` sequences per prompt and returns the best one.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--max-input-length", + type=int, + default=1024, + help=( + "Maximum number of input tokens for filtering the benchmark dataset." + ), + ) + parser.add_argument( + "--max-output-length", + type=int, + default=1024, + help=( + "Maximum number of input tokens for filtering the benchmark dataset." + ), + ) + parser.add_argument( + "--top-k", + type=int, + default=32000, + help=( + "Number of candidate tokens that are considered at each step of the" + " generation process. 32000 is the vocab_size of Open-LLaMA and" + " LLaMA2 models." + ), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help=( + "Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process to synthesize " + "the request arrival times." 
+ ), + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="trust remote code from huggingface", + ) + parser.add_argument( + "--machine-cost", + type=float, + default=None, + help="Machine cost per hour including accelerators (if any)", + ) + parser.add_argument( + "--use-dummy-text", + action="store_true", + help=( + "Whether to use dummy text with length defined by max_input_length" + " and max_output_length." + ), + ) + cmd_args = parser.parse_args() + main(cmd_args) + \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh new file mode 100644 index 000000000..9c9e5ccf5 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -o xtrace + +export IP=$IP + +huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential + +for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do + # TODO: Check if profile already exists, if so then skip + timestamp=$(date +"%Y-%m-%d_%H-%M-%S") + output_file="latency-profile-${timestamp}.txt" + python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file +done \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt new file mode 100644 index 000000000..739d46f7d --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt @@ -0,0 +1,37 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +ruff==0.1.5 + +# type checking +mypy==0.991 +types-PyYAML +types-requests +types-setuptools + +# testing +pytest +pytest-forked +pytest-asyncio +httpx +einops # required for MPT +flash_attn # required for HuggingFace's llama implementation +openai +requests + +# run +ninja # For faster builds. +psutil +ray >= 2.9 +sentencepiece # Required for LLaMA tokenizer. +numpy +torch == 2.1.1 +transformers >= 4.37.0 # Required for Qwen2 +xformers == 0.0.23 +fastapi +uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. 
+aioprometheus[starlette] +pynvml == 11.5.0 +accelerate +aiohttp \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf index 781fd772e..694e8c324 100644 --- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf @@ -44,6 +44,7 @@ data "google_client_config" "identity" { count = var.credentials_config.fleet_host != null ? 1 : 0 } + resource "kubernetes_manifest" "latency-profile-generator" { manifest = yamldecode(templatefile(local.latency-profile-generator-template, { namespace = var.namespace