Adding support for the vLLM OpenAI entrypoint to the benchmarking script (#793)
* changes to support openai entrypoint

* cat command to show results

* defaulted entrypoint in vllm case, updated docs and versions

* readded comment

* fixed lint issue

* ran terraform fmt

* fixed comments

* changes to support jetstream
Edwinhr716 committed Sep 3, 2024
1 parent b22273b commit c872599
Showing 4 changed files with 29 additions and 10 deletions.
@@ -114,13 +114,15 @@ async def send_request(
     top_k: int,
     tokenizer: PreTrainedTokenizerBase,
     sax_model: str,
+    model: str,
 ) -> None:
   """Sends request to server."""
   request_start_time = time.time()
 
   headers = {"User-Agent": "Benchmark Client"}
   if backend == "vllm":
     pload = {
+        "model": model,
         "prompt": prompt,
         "n": 1,
         "best_of": best_of,
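For context, the vLLM OpenAI-compatible entrypoint expects the served model name alongside the usual sampling fields. A minimal, hedged sketch of the payload this branch now builds for the vllm backend; only "model", "prompt", "n", and "best_of" appear in the hunk above, and the model id and remaining sampling fields below are illustrative assumptions based on the OpenAI completions schema that vLLM accepts:

# Hedged sketch of an OpenAI-style completions request body (not the script's exact dict).
pload = {
    "model": "meta-llama/Llama-2-7b-hf",  # assumption: must match what the server was launched with
    "prompt": "Explain paged attention in one sentence.",
    "n": 1,
    "best_of": 1,
    "temperature": 1.0,   # assumed sampling defaults
    "top_p": 1.0,
    "max_tokens": 128,
    "stream": False,
}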
@@ -179,7 +181,7 @@ async def send_request(
   elif backend == "jetstream":
     pload = {
         "prompt": prompt,
-        "max_tokens": 1,
+        "max_tokens": output_len,
     }
   else:
     raise ValueError(f"Unknown backend: {backend}")
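The old value capped every JetStream request at a single generated token, so measured latency barely exercised decoding; requesting the sampled output length makes the numbers comparable with the other backends. A tiny sketch of the corrected request body, with an illustrative prompt and length (output_len is sampled per request from the dataset elsewhere in the script):

# Sketch only: values are illustrative, not taken from the script.
output_len = 256
pload = {"prompt": "Tell me a story about a robot.", "max_tokens": output_len}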
@@ -219,9 +221,8 @@ async def send_request(
     output_token_ids = tokenizer(output["generated_text"]).input_ids
     output_len = len(output_token_ids)
   elif backend == "vllm":
-    total_token_ids = tokenizer(output["text"][0]).input_ids
-    new_total_len = len(total_token_ids)
-    output_len = new_total_len - prompt_len
+    output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids
+    output_len = len(output_token_ids)
   elif backend == "jetstream":
     output_token_ids = tokenizer(output["response"]).input_ids
     output_len = len(output_token_ids)
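The legacy vLLM /generate endpoint echoed the prompt plus completion in output["text"][0], hence the prompt-length subtraction; the OpenAI-style response returns only the generated text under choices, so the new code tokenizes it directly. A self-contained sketch of that parsing path, where the response body and the tokenizer choice are illustrative assumptions:

from transformers import AutoTokenizer

# Illustrative OpenAI-style completions response; only choices[0]["text"] is read above.
output = {
    "id": "cmpl-123",
    "object": "text_completion",
    "choices": [
        {"index": 0,
         "text": " Paged attention stores the KV cache in fixed-size blocks.",
         "finish_reason": "stop"},
    ],
}

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for the sketch
output_len = len(tokenizer(output["choices"][0]["text"]).input_ids)
print(output_len)  # counts generated tokens only; no prompt_len subtraction needed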
@@ -240,6 +241,7 @@ async def benchmark(
     top_k: int,
     tokenizer: PreTrainedTokenizerBase,
     sax_model: str,
+    model: str,
 ) -> None:
   """Runs benchmark with asynchronous requests."""
   tasks: List[asyncio.Task] = []
@@ -257,6 +259,7 @@ async def benchmark(
             top_k,
             tokenizer,
             sax_model,
+            model,
         )
     )
     tasks.append(task)
@@ -268,7 +271,13 @@ def main(args: argparse.Namespace):
   random.seed(args.seed)
   np.random.seed(args.seed)
 
-  api_url = f"http://{args.host}:{args.port}/{args.endpoint}"
+  endpoint = (
+      "v1/completions"
+      if args.backend == "vllm"
+      else args.endpoint
+  )
+
+  api_url = f"http://{args.host}:{args.port}/{endpoint}"
   tokenizer = AutoTokenizer.from_pretrained(
       args.tokenizer, trust_remote_code=args.trust_remote_code
   )
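With this change the vllm backend always targets the OpenAI-compatible completions route, while every other backend keeps honoring --endpoint. A small sketch of the resulting URL under the argparse defaults shown further down; the non-vllm endpoint value here is just a stand-in for whatever --endpoint was given:

backend = "vllm"
endpoint = "v1/completions" if backend == "vllm" else "generate"  # "generate" is a stand-in for args.endpoint
api_url = f"http://localhost:7080/{endpoint}"  # host/port are the argparse defaults below
print(api_url)  # -> http://localhost:7080/v1/completions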
@@ -293,6 +302,7 @@ def main(args: argparse.Namespace):
           args.top_k,
           tokenizer,
           args.sax_model,
+          args.model,
       )
   )
   benchmark_end_time = time.time()
@@ -388,6 +398,11 @@ def main(args: argparse.Namespace):
   parser.add_argument("--host", type=str, default="localhost")
   parser.add_argument("--port", type=int, default=7080)
   parser.add_argument("--dataset", type=str, help="Path to the dataset.")
+  parser.add_argument(
+      "--model",
+      type=str,
+      help="Name of the model.",
+  )
   parser.add_argument(
       "--tokenizer",
       type=str,
@@ -23,5 +23,8 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
-done
+  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
+  cat $output_file
+  sleep 5 # wait 5 seconds before next run
+done
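The script reuses $TOKENIZER for --model, which works when the vLLM OpenAI server was launched with the same Hugging Face model id that it tokenizes with. A hedged way to double-check the served model name before a run; this assumes the requests package, that the server is reachable on the same host/port the script uses, and that it exposes the standard OpenAI /v1/models route:

import requests

# List the model ids the OpenAI-compatible server reports; the --model value
# passed by the script (here $TOKENIZER) should match one of them exactly.
resp = requests.get("http://localhost:7080/v1/models", timeout=10)
for m in resp.json().get("data", []):
    print(m["id"])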

5 changes: 3 additions & 2 deletions benchmarks/benchmark/tools/profile-generator/sample.tfvars
@@ -29,7 +29,8 @@ output_bucket = "your_project_id-benchmark-output-b
 k8s_hf_secret = "hf-token"
 
 # Benchmark configuration for Locust Docker accessing inference server
-request_rates = [5, 10, 15, 20]
+request_rates     = [5, 10, 15, 20]
+artifact_registry = "your_project_artifact_registry"
 
 # Model server configuration information
 targets = {
@@ -39,4 +40,4 @@ targets = {
     service_port = "your_model_service_service_port"
     tokenizer = "your_tokenizer"
   }
-}
+}
@@ -51,7 +51,7 @@ spec:
       - name: vllm
         ports:
         - containerPort: 80
-        image: "vllm/vllm-openai:v0.3.3"
+        image: "vllm/vllm-openai:v0.5.5"
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
         env:
