From 68cd83f9d0b1a6789a3738b3bb3e43f8a45d63cf Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Thu, 24 Oct 2024 15:30:39 -0700 Subject: [PATCH 1/9] Refactor benchmark_throughput.py * Give the request tuple a name * Add helper message for --dataset flag Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 70 ++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ee41c8ea38382..c4b8c3822edd2 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -15,16 +15,34 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) +from vllm.inputs import SingletonPrompt +from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g., images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + + def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> List[SampleRequest]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -41,7 +59,7 @@ def sample_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: List[SampleRequest] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -60,13 +78,16 @@ def sample_requests( if prompt_len > 1024 or prompt_len + output_len > 2048: # Prune too long sequences. continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append( + SampleRequest(prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len)) return filtered_dataset def run_vllm( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: EngineArgs, ) -> float: @@ -74,17 +95,17 @@ def run_vllm( llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[SingletonPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append(request.prompt) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) use_beam_search = False @@ -94,11 +115,11 @@ def run_vllm( llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] # output_len should be the same for all requests. 
output_len = requests[0][2] - for prompt, input_len, _output_len in requests: - assert _output_len == output_len + for request in requests: + assert request.expected_output_len == output_len start = time.perf_counter() llm.beam_search( prompts, @@ -112,7 +133,7 @@ def run_vllm( async def run_vllm_async( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, @@ -123,17 +144,17 @@ async def run_vllm_async( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[SingletonPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append(request.prompt) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) generators = [] @@ -149,7 +170,7 @@ async def run_vllm_async( def run_hf( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -207,14 +228,14 @@ def run_hf( def run_mii( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, ) -> float: from mii import client, serve llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] start = time.perf_counter() llm.generate(prompts, max_new_tokens=output_len) @@ -270,9 +291,10 @@ def main(args: argparse.Namespace): args.output_len) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - total_output_tokens = sum(output_len for _, _, output_len in requests) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") @@ -299,7 +321,9 @@ def main(args: argparse.Namespace): parser.add_argument("--dataset", type=str, default=None, - help="Path to the dataset.") + help="Path to the dataset. 
The dataset is expected to " + "be a json in form of List[Dict[..., conversations: " + "List[Dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, From 9329d8d93faa9db23842d9a5f2db422888011398 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 25 Oct 2024 12:14:54 -0700 Subject: [PATCH 2/9] Refactor benchmark_throughput.py to pass TextPrompt instead of string * This is preparation to support multi-modality input, by reusing existing TextPrompt structure * no significant metrics diff, see below - before: Throughput: 13.99 requests/s, 2933.11 total tokens/s, 2758.10 output tokens/s - after: Throughput: 13.99 requests/s, 2932.69 total tokens/s, 2757.70 output tokens/s - test command: `python benchmarks/benchmark_throughput.py --model mistral-community/pixtral-12b --max-model-len=8192 --dataset ../sharegpt4v_instruct_gpt4-vision_cap100k.json` Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c4b8c3822edd2..21a4a71b3d1d2 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -15,7 +15,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.inputs import SingletonPrompt +from vllm.inputs import TextPrompt from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators @@ -95,10 +95,10 @@ def run_vllm( llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. - prompts: List[SingletonPrompt] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(request.prompt) + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, @@ -144,10 +144,10 @@ async def run_vllm_async( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. - prompts: List[SingletonPrompt] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(request.prompt) + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, From 2623fea07f9af320e9d823f3c924241e86750f42 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 25 Oct 2024 13:32:30 -0700 Subject: [PATCH 3/9] Update benchmark_throughput.py to support image input Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 21a4a71b3d1d2..3c30aad87ab11 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -7,6 +7,7 @@ from typing import List, Optional, Tuple import torch +from PIL import Image import uvloop from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, @@ -51,23 +52,20 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. 
random.shuffle(dataset) # Filter out sequences that are too long or too short filtered_dataset: List[SampleRequest] = [] - for i in range(len(dataset)): + for data in dataset: if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] # Tokenize the prompts and completions. - prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -82,6 +80,11 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) + if "image" in data: + filtered_dataset[-1].multi_modal_data = filtered_dataset[-1].multi_modal_data or {} + image_path = data["image"] + assert isinstance(image_path, str) + filtered_dataset[-1].multi_modal_data["image"] = Image.open(image_path).convert("RGB") return filtered_dataset @@ -98,7 +101,7 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt)) + prompts.append(TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, From 2378563beed5cb0371e466e8bea6c5a145d6f375 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 25 Oct 2024 13:39:51 -0700 Subject: [PATCH 4/9] dbg Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 3c30aad87ab11..a4489b2fa266d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -80,6 +80,7 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) + print(data) if "image" in data: filtered_dataset[-1].multi_modal_data = filtered_dataset[-1].multi_modal_data or {} image_path = data["image"] @@ -102,6 +103,7 @@ def run_vllm( sampling_params: List[SamplingParams] = [] for request in requests: prompts.append(TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) + print(prompts[-1]) sampling_params.append( SamplingParams( n=n, From 917ccb30a7b04083c8b38e81053f0c3cb80ec3ff Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 28 Oct 2024 16:53:20 -0700 Subject: [PATCH 5/9] fix lint Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index a4489b2fa266d..fafc7d2f96087 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -4,7 +4,7 @@ import json import random import time -from typing import List, Optional, Tuple +from typing import List, Optional import torch from PIL import Image @@ -28,7 +28,8 @@ class SampleRequest: Attributes: prompt: The input text prompt for the model. - multi_modal_data: Optional dictionary containing multi-modal data (e.g., images). + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). prompt_len: The length of the prompt in tokens. expected_output_len: The expected length of the output in tokens. 
""" From 6cb2fa7c81cfe790dbf26106b6631fafcc51b651 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 28 Oct 2024 16:58:48 -0700 Subject: [PATCH 6/9] fix lint Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index fafc7d2f96087..98526e0a3f364 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -81,12 +81,13 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) - print(data) if "image" in data: - filtered_dataset[-1].multi_modal_data = filtered_dataset[-1].multi_modal_data or {} + filtered_dataset[-1].multi_modal_data = filtered_dataset[ + -1].multi_modal_data or {} image_path = data["image"] assert isinstance(image_path, str) - filtered_dataset[-1].multi_modal_data["image"] = Image.open(image_path).convert("RGB") + filtered_dataset[-1].multi_modal_data["image"] = Image.open( + image_path).convert("RGB") return filtered_dataset @@ -103,8 +104,9 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append(TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data)) - print(prompts[-1]) + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, From a0199b52c5dcdee6f996aa91fd8f5ebcf7e38d11 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Tue, 29 Oct 2024 12:21:26 -0700 Subject: [PATCH 7/9] fix sort Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 98526e0a3f364..87c42a15e1249 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -7,8 +7,8 @@ from typing import List, Optional import torch -from PIL import Image import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) From eb6e01bf62996b246ee498f1651df255de780ed8 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Tue, 29 Oct 2024 13:41:25 -0700 Subject: [PATCH 8/9] Revert "Update benchmark_throughput.py to support image input" This reverts commit 2623fea07f9af320e9d823f3c924241e86750f42. Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 87c42a15e1249..32c3b54c9651f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -8,7 +8,6 @@ import torch import uvloop -from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -53,20 +52,23 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [(data["conversations"][0]["value"], + data["conversations"][1]["value"]) for data in dataset] + # Shuffle the dataset. 
random.shuffle(dataset) # Filter out sequences that are too long or too short filtered_dataset: List[SampleRequest] = [] - for data in dataset: + for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break - # Only keep the first two turns of each conversation. - prompt = data["conversations"][0]["value"] - completion = data["conversations"][1]["value"] # Tokenize the prompts and completions. + prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids + completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -81,13 +83,6 @@ def sample_requests( SampleRequest(prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len)) - if "image" in data: - filtered_dataset[-1].multi_modal_data = filtered_dataset[ - -1].multi_modal_data or {} - image_path = data["image"] - assert isinstance(image_path, str) - filtered_dataset[-1].multi_modal_data["image"] = Image.open( - image_path).convert("RGB") return filtered_dataset @@ -104,9 +99,7 @@ def run_vllm( prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] for request in requests: - prompts.append( - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, From 103225bc392f01b119bf025843254ae5c954007f Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 4 Nov 2024 10:32:31 -0800 Subject: [PATCH 9/9] Use named struct (instead of tuple) if dataset not provided Signed-off-by: Linkun Chen --- benchmarks/benchmark_throughput.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 32c3b54c9651f..262b8652e49ff 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -265,8 +265,12 @@ def main(args: argparse.Namespace): else: raise ValueError( f"Failed to synthesize a prompt with {args.input_len} tokens.") - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] + requests = [ + SampleRequest(prompt=prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len) + for _ in range(args.num_prompts) + ] else: requests = sample_requests(args.dataset, args.num_prompts, tokenizer, args.output_len)
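
Reviewer note (not part of the patches): below is a minimal sketch of the request flow after this series is applied — each benchmark request is a SampleRequest, which run_vllm converts to a TextPrompt before handing it to llm.generate(). The model name, example prompt, and token counts are placeholders chosen for illustration; the dataclass definition mirrors the one added in PATCH 1/9, and the imports match those used in the diffs above.

    import dataclasses
    from typing import Optional

    from vllm import LLM, SamplingParams
    from vllm.inputs import TextPrompt
    from vllm.multimodal import MultiModalDataDict

    @dataclasses.dataclass
    class SampleRequest:
        # Same shape as the dataclass introduced in PATCH 1/9.
        prompt: str
        prompt_len: int
        expected_output_len: int
        multi_modal_data: Optional[MultiModalDataDict] = None

    # Placeholder request; in the benchmark these come from sample_requests()
    # or are synthesized from --input-len / --output-len.
    requests = [
        SampleRequest(prompt="Hello, my name is",
                      prompt_len=5,
                      expected_output_len=16),
    ]

    llm = LLM(model="facebook/opt-125m")  # placeholder model
    prompts = [TextPrompt(prompt=r.prompt) for r in requests]
    sampling_params = [
        SamplingParams(n=1,
                       temperature=1.0,
                       top_p=1.0,
                       ignore_eos=True,
                       max_tokens=r.expected_output_len)
        for r in requests
    ]
    llm.generate(prompts, sampling_params, use_tqdm=True)

Replacing the old (prompt, prompt_len, output_len) tuple with a named dataclass lets later patches attach optional fields such as multi_modal_data without touching every call site, which is why PATCH 9/9 also switches the synthetic-prompt path to SampleRequest.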