Skip to content

Commit

Permalink
configurable pipeline starting point, request rates configurable
Browse files Browse the repository at this point in the history
  • Loading branch information
Bslabe123 committed Aug 15, 2024
1 parent 3e62239 commit 44143e7
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ export IP=$IP

huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential

timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
output_file="latency-profile-${timestamp}.txt"
for ((i = 1 ; i <= 2 ; i*=2 )); do
python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$i --backend="$BACKEND" --num-prompts=2 --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
# Run one benchmark pass per request rate. REQUEST_RATES is a comma-separated
# list (e.g. "1,2") rendered from the Terraform `request_rates` variable.
IFS=',' read -r -a request_rate_list <<< "$REQUEST_RATES"
for request_rate in "${request_rate_list[@]}"; do
  # Timestamped output file per run. Second-granularity: two runs finishing
  # within the same second would collide — assumes each run takes longer.
  timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
  output_file="latency-profile-${timestamp}.txt"
  # Prompt count scales with the rate (presumably ~30s of offered load — TODO confirm).
  # NOTE(review): $(( )) requires integer rates; a fractional rate would make
  # the arithmetic expansion fail.
  python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate="$request_rate" --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length="$INPUT_LENGTH" --max-output-length="$OUTPUT_LENGTH" > "$output_file"
done

5 changes: 5 additions & 0 deletions benchmarks/benchmark/tools/latency-profile/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
* limitations under the License.
*/

## BEFORE APPLYING TEMPLATES

# 1) Ensure a new data point needs to be uploaded — either none exists yet, or the existing one is unsatisfactory
# 2) Use the `catalog generate` tool to generate the manifests and pipe them to `kubectl apply -f`, assure kubectl succeeds
locals {
templates = [
for f in fileset(local.templates_path, "*tpl") :
Expand All @@ -38,6 +42,7 @@ locals {
max_num_prompts = var.max_num_prompts
max_output_len = var.max_output_len
max_prompt_len = var.max_prompt_len
request_rates = join(",", [for number in var.request_rates : tostring(number)])
tokenizer = var.tokenizer
hugging_face_token_b64 = var.hugging_face_token_b64
k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ spec:
value: ${max_prompt_len}
- name: OUTPUT_LENGTH
value: ${max_output_len}
- name: REQUEST_RATES
value: ${request_rates}
- name: OUTPUT_BUCKET
value: ${output_bucket}
- name: HF_TOKEN
Expand Down
58 changes: 58 additions & 0 deletions benchmarks/benchmark/tools/latency-profile/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,13 @@ variable "max_prompt_len" {
}
}

# Request rates (requests/sec) to benchmark at. Joined into a comma-separated
# string in main.tf (`request_rates` local) and iterated over by the
# latency-profile benchmark script, one benchmark run per rate.
variable "request_rates" {
  description = "List of request rates (requests per second) to benchmark the model server at; one benchmark run is executed per rate."
  type        = list(number)
  default     = [1, 2]
  nullable    = false
}

variable "tokenizer" {
description = "Benchmark server configuration for tokenizer."
type = string
Expand Down Expand Up @@ -155,4 +162,55 @@ variable "hugging_face_token_b64" {
description = "Base 64 encoded hugging face token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
type = string
nullable = false
}

# Describes every combination of model / model server / accelerator that the
# benchmarking pipeline should run, plus the request rates to test them at.
variable "pipeline_config" {
  description = "All combinations of model/model_server/accelerators to benchmark"
  type = object({
    # Allow-list of model names; the second validation below requires every
    # model referenced in `config` to appear here.
    valid_models = list(string)
    # Allow-list of accelerator names; the first validation below requires
    # every accelerator referenced in `config` to appear here.
    valid_accelerators = list(string)
    # Request rates to benchmark at.
    # NOTE(review): declared but not referenced by the validations shown —
    # confirm it is consumed elsewhere.
    request_rates = list(number)

    config = list(object({
      model_server = string # Model server name
      model_server_configs = list(object({
        models = list(string) # model name
        model_configs = list(object({
          accelerators = list(string) # Accelerator name
          accelerator_configs = list(object({
            accelerator_count = number # Number of accelerators
          }))
        }))
      }))
    }))
  })

  # Reject any accelerator name in `config` that is not listed in
  # `valid_accelerators`.
  validation {
    condition = alltrue([
      for cfg in var.pipeline_config.config : alltrue([
        for model_server_config in cfg.model_server_configs : (
          alltrue([
            for model_config in model_server_config.model_configs :
            alltrue([for accelerator in model_config.accelerators :
              contains(var.pipeline_config.valid_accelerators, accelerator)])
          ])
        )
      ])
    ])
    error_message = "Each accelerator must be in the valid_accelerators list."
  }

  # Reject any model name in `config` that is not listed in `valid_models`.
  validation {
    condition = alltrue([
      for cfg in var.pipeline_config.config : alltrue([
        for model_server_config in cfg.model_server_configs : (
          alltrue([
            for model in model_server_config.models :
            contains(var.pipeline_config.valid_models, model)
          ])
        )
      ])
    ])
    error_message = "Each model must be in the valid_models list."
  }
}
Empty file.
Empty file.

0 comments on commit 44143e7

Please sign in to comment.