[V1] Increase default batch size for H100/H200 (vllm-project#12369)
Signed-off-by: Woosuk Kwon <[email protected]>
WoosukKwon authored Jan 24, 2025
1 parent: 55ef66e · commit: 0e74d79
Showing 1 changed file with 16 additions and 5 deletions.
vllm/engine/arg_utils.py
@@ -1279,11 +1279,22 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         self.enable_chunked_prefill = True
         # When no user override, set the default values based on the usage
         # context.
-        # TODO(woosuk): Tune the default values for different hardware.
-        default_max_num_batched_tokens = {
-            UsageContext.LLM_CLASS: 8192,
-            UsageContext.OPENAI_API_SERVER: 2048,
-        }
+        # Use different default values for different hardware.
+        from vllm.platforms import current_platform
+        device_name = current_platform.get_device_name().lower()
+        if "h100" in device_name or "h200" in device_name:
+            # For H100 and H200, we use larger default values.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 16384,
+                UsageContext.OPENAI_API_SERVER: 8192,
+            }
+        else:
+            # TODO(woosuk): Tune the default values for other hardware.
+            default_max_num_batched_tokens = {
+                UsageContext.LLM_CLASS: 8192,
+                UsageContext.OPENAI_API_SERVER: 2048,
+            }
+
         if (self.max_num_batched_tokens is None
                 and usage_context in default_max_num_batched_tokens):
             self.max_num_batched_tokens = default_max_num_batched_tokens[
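
For reference, a minimal, self-contained sketch of the selection logic this diff introduces. The UsageContext enum and the pick_default_max_num_batched_tokens helper below are illustrative stand-ins for this note only, not vLLM's actual API; in the real code the branch runs inline in _override_v1_engine_args and reads the device name from vllm.platforms.current_platform.

    from enum import Enum, auto


    class UsageContext(Enum):
        # Illustrative stand-in for vLLM's UsageContext enum.
        LLM_CLASS = auto()
        OPENAI_API_SERVER = auto()


    def pick_default_max_num_batched_tokens(device_name: str,
                                            usage_context: UsageContext) -> int:
        # Same substring check as the diff: H100/H200 get larger defaults.
        name = device_name.lower()
        if "h100" in name or "h200" in name:
            defaults = {
                UsageContext.LLM_CLASS: 16384,
                UsageContext.OPENAI_API_SERVER: 8192,
            }
        else:
            defaults = {
                UsageContext.LLM_CLASS: 8192,
                UsageContext.OPENAI_API_SERVER: 2048,
            }
        return defaults[usage_context]


    # e.g. an H200 behind the OpenAI-compatible server defaults to 8192:
    assert pick_default_max_num_batched_tokens(
        "NVIDIA H200", UsageContext.OPENAI_API_SERVER) == 8192

Note that the guard visible at the bottom of the hunk still gives user settings priority: the new defaults apply only when self.max_num_batched_tokens is None, i.e. when the user has not set max_num_batched_tokens explicitly.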
