From 68cd83f9d0b1a6789a3738b3bb3e43f8a45d63cf Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Thu, 24 Oct 2024 15:30:39 -0700 Subject: [PATCH 1/9] Refactor benchmark_throughput.py * Give the request tuple a name * Add helper message for --dataset flag Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 70 ++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ee41c8ea38382..c4b8c3822edd2 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -15,16 +15,34 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) +from vllm.inputs import SingletonPrompt +from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g., images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + + def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> List[SampleRequest]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -41,7 +59,7 @@ def sample_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: List[SampleRequest] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -60,13 +78,16 @@ def sample_requests( if prompt_len > 1024 or prompt_len + output_len > 2048: # Prune too long sequences. continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append( + SampleRequest(prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len)) return filtered_dataset def run_vllm( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: EngineArgs, ) -> float: @@ -74,17 +95,17 @@ def run_vllm( llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[SingletonPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append(request.prompt) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) use_beam_search = False @@ -94,11 +115,11 @@ def run_vllm( llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] # output_len should be the same for all requests. 
output_len = requests[0][2] - for prompt, input_len, _output_len in requests: - assert _output_len == output_len + for request in requests: + assert request.expected_output_len == output_len start = time.perf_counter() llm.beam_search( prompts, @@ -112,7 +133,7 @@ def run_vllm( async def run_vllm_async( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, @@ -123,17 +144,17 @@ async def run_vllm_async( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[SingletonPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append(request.prompt) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) generators = [] @@ -149,7 +170,7 @@ async def run_vllm_async( def run_hf( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -207,14 +228,14 @@ def run_hf( def run_mii( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, ) -> float: from mii import client, serve llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] start = time.perf_counter() llm.generate(prompts, max_new_tokens=output_len) @@ -270,9 +291,10 @@ def main(args: argparse.Namespace): args.output_len) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - total_output_tokens = sum(output_len for _, _, output_len in requests) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") @@ -299,7 +321,9 @@ def main(args: argparse.Namespace): parser.add_argument("--dataset", type=str, default=None, - help="Path to the dataset.") + help="Path to the dataset. 
The dataset is expected to " + "be a json in form of List[Dict[..., conversations: " + "List[Dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, From 9329d8d93faa9db23842d9a5f2db422888011398 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 25 Oct 2024 12:14:54 -0700 Subject: [PATCH 2/9] Refactor benchmark_throughput.py to pass TextPrompt instead of string * This is preparation to support multi-modality input, by reusing existing TextPrompt structure * no significant metrics diff, see below - before: Throughput: 13.99 requests/s, 2933.11 total tokens/s, 2758.10 output tokens/s - after: Throughput: 13.99 requests/s, 2932.69 total tokens/s, 2757.70 output tokens/s - test command: `python benchmarks/benchmark_throughput.py --model mistral-community/pixtral-12b --max-model-len=8192 --dataset ../sharegpt4v_instruct_gpt4-vision_cap100k.json` Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c4b8c3822edd2..21a4a71b3d1d2 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -15,7 +15,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.inputs import SingletonPrompt +from vllm.inputs import TextPrompt from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators @@ -95,10 +95,10 @@ def run_vllm( llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. - prompts: List[SingletonPrompt] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(request.prompt) + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, @@ -144,10 +144,10 @@ async def run_vllm_async( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. - prompts: List[SingletonPrompt] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(request.prompt) + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, From 2623fea07f9af320e9d823f3c924241e86750f42 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 25 Oct 2024 13:32:30 -0700 Subject: [PATCH 3/9] Update benchmark_throughput.py to support image input Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 21a4a71b3d1d2..3c30aad87ab11 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -7,6 +7,7 @@ from typing import List, Optional, Tuple import torch +from PIL import Image import uvloop from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, @@ -51,23 +52,20 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. 
random.shuffle(dataset) # Filter out sequences that are too long or too short filtered_dataset: List[SampleRequest] = [] - for i in range(len(dataset)): + for data in dataset: if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] # Tokenize the prompts and completions. - prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -82,6 +80,11 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) + if "image" in data: + filtered_dataset[-1].multi_modal_data = filtered_dataset[-1].multi_modal_data or {} + image_path = data["image"] + assert isinstance(image_path, str) + filtered_dataset[-1].multi_modal_data["image"] = Image.open(image_path).convert("RGB") return filtered_dataset @@ -98,7 +101,7 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append(TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, From 2378563beed5cb0371e466e8bea6c5a145d6f375 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 25 Oct 2024 13:39:51 -0700 Subject: [PATCH 4/9] dbg Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 3c30aad87ab11..a4489b2fa266d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -80,6 +80,7 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) + print(data) if "image" in data: filtered_dataset[-1].multi_modal_data = filtered_dataset[-1].multi_modal_data or {} image_path = data["image"] @@ -102,6 +103,7 @@ def run_vllm( sampling_params: List[SamplingParams] = [] for request in requests: prompts.append(TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) + print(prompts[-1]) sampling_params.append( SamplingParams( n=n, From 917ccb30a7b04083c8b38e81053f0c3cb80ec3ff Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 28 Oct 2024 16:53:20 -0700 Subject: [PATCH 5/9] fix lint Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index a4489b2fa266d..fafc7d2f96087 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -4,7 +4,7 @@ import json import random import time -from typing import List, Optional, Tuple +from typing import List, Optional import torch from PIL import Image @@ -28,7 +28,8 @@ class SampleRequest: Attributes: prompt: The input text prompt for the model. - multi_modal_data: Optional dictionary containing multi-modal data (e.g., images). + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). prompt_len: The length of the prompt in tokens. expected_output_len: The expected length of the output in tokens. 
""" From 6cb2fa7c81cfe790dbf26106b6631fafcc51b651 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 28 Oct 2024 16:58:48 -0700 Subject: [PATCH 6/9] fix lint Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index fafc7d2f96087..98526e0a3f364 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -81,12 +81,13 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) - print(data) if "image" in data: - filtered_dataset[-1].multi_modal_data = filtered_dataset[-1].multi_modal_data or {} + filtered_dataset[-1].multi_modal_data = filtered_dataset[ + -1].multi_modal_data or {} image_path = data["image"] assert isinstance(image_path, str) - filtered_dataset[-1].multi_modal_data["image"] = Image.open(image_path).convert("RGB") + filtered_dataset[-1].multi_modal_data["image"] = Image.open( + image_path).convert("RGB") return filtered_dataset @@ -103,8 +104,9 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) - print(prompts[-1]) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, From a0199b52c5dcdee6f996aa91fd8f5ebcf7e38d11 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Tue, 29 Oct 2024 12:21:26 -0700 Subject: [PATCH 7/9] fix sort Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 98526e0a3f364..87c42a15e1249 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -7,8 +7,8 @@ from typing import List, Optional import torch -from PIL import Image import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) From eb6e01bf62996b246ee498f1651df255de780ed8 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Tue, 29 Oct 2024 13:41:25 -0700 Subject: [PATCH 8/9] Revert "Update benchmark_throughput.py to support image input" This reverts commit 2623fea07f9af320e9d823f3c924241e86750f42. Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 87c42a15e1249..32c3b54c9651f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -8,7 +8,6 @@ import torch import uvloop -from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -53,20 +52,23 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [(data["conversations"][0]["value"], + data["conversations"][1]["value"]) for data in dataset] + # Shuffle the dataset. 
random.shuffle(dataset) # Filter out sequences that are too long or too short filtered_dataset: List[SampleRequest] = [] - for data in dataset: + for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break - # Only keep the first two turns of each conversation. - prompt = data["conversations"][0]["value"] - completion = data["conversations"][1]["value"] # Tokenize the prompts and completions. + prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids + completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -81,13 +83,6 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) - if "image" in data: - filtered_dataset[-1].multi_modal_data = filtered_dataset[ - -1].multi_modal_data or {} - image_path = data["image"] - assert isinstance(image_path, str) - filtered_dataset[-1].multi_modal_data["image"] = Image.open( - image_path).convert("RGB") return filtered_dataset @@ -104,9 +99,7 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append( - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, From 103225bc392f01b119bf025843254ae5c954007f Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 4 Nov 2024 10:32:31 -0800 Subject: [PATCH 9/9] Use named struct (instead of tuple) if dataset not provided Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 32c3b54c9651f..262b8652e49ff 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -265,8 +265,12 @@ def main(args: argparse.Namespace): else: raise ValueError( f"Failed to synthesize a prompt with {args.input_len} tokens.") - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] + requests = [ + SampleRequest(prompt=prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len) + for _ in range(args.num_prompts) + ] else: requests = sample_requests(args.dataset, args.num_prompts, tokenizer, args.output_len)
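
Reviewer note (not part of the patches): below is a minimal sketch of the request flow after this series is applied — each benchmark request is a SampleRequest, which run_vllm converts to a TextPrompt before handing it to llm.generate(). The model name, example prompt, and token counts are placeholders chosen for illustration; the dataclass definition mirrors the one added in PATCH 1/9, and the imports match those used in the diffs above.

    import dataclasses
    from typing import Optional

    from vllm import LLM, SamplingParams
    from vllm.inputs import TextPrompt
    from vllm.multimodal import MultiModalDataDict

    @dataclasses.dataclass
    class SampleRequest:
        # Same shape as the dataclass introduced in PATCH 1/9.
        prompt: str
        prompt_len: int
        expected_output_len: int
        multi_modal_data: Optional[MultiModalDataDict] = None

    # Placeholder request; in the benchmark these come from sample_requests()
    # or are synthesized from --input-len / --output-len.
    requests = [
        SampleRequest(prompt="Hello, my name is",
                      prompt_len=5,
                      expected_output_len=16),
    ]

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    prompts = [TextPrompt(prompt=r.prompt) for r in requests]
    sampling_params = [
        SamplingParams(n=1,
                       temperature=1.0,
                       top_p=1.0,
                       ignore_eos=True,
                       max_tokens=r.expected_output_len)
        for r in requests
    ]
    llm.generate(prompts, sampling_params, use_tqdm=True)

Replacing the old (prompt, prompt_len, output_len) tuple with a named dataclass lets later patches attach optional fields such as multi_modal_data without touching every call site, which is why PATCH 9/9 also switches the synthetic-prompt path to SampleRequest.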