
Commit be1b7f0

WoosukKwon and Akshat-Tripathi authored and committed
[V1][Bugfix] Always set enable_chunked_prefill = True for V1 (vllm-project#11061)
Signed-off-by: Woosuk Kwon <[email protected]>
Signed-off-by: Akshat Tripathi <[email protected]>
1 parent 3e734b3 commit be1b7f0

File tree
1 file changed (+12 -9 lines)


vllm/engine/arg_utils.py

+12 -9
@@ -122,7 +122,7 @@ class EngineArgs:
     cpu_offload_gb: float = 0  # GiB
     gpu_memory_utilization: float = 0.90
     max_num_batched_tokens: Optional[int] = None
-    max_num_seqs: int = 256
+    max_num_seqs: Optional[int] = None
     max_logprobs: int = 20  # Default value for OpenAI Chat Completions API
     disable_log_stats: bool = False
     revision: Optional[str] = None
@@ -205,6 +205,9 @@ def __post_init__(self):
         # by user.
         if self.enable_prefix_caching is None:
             self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
+        # Override max_num_seqs if it's not set by user.
+        if self.max_num_seqs is None:
+            self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
 
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
@@ -1225,19 +1228,19 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         """
         assert envs.VLLM_USE_V1, "V1 is not enabled"
 
+        # V1 always uses chunked prefills.
+        self.enable_chunked_prefill = True
+        # When no user override, set the default values based on the usage
+        # context.
+        # TODO(woosuk): Tune the default values for different hardware.
         if self.max_num_batched_tokens is None:
-            # When no user override, set the default values based on the
-            # usage context.
             if usage_context == UsageContext.LLM_CLASS:
-                logger.warning("Setting max_num_batched_tokens to 8192 "
-                               "for LLM_CLASS usage context.")
-                self.max_num_seqs = 1024
                 self.max_num_batched_tokens = 8192
             elif usage_context == UsageContext.OPENAI_API_SERVER:
-                logger.warning("Setting max_num_batched_tokens to 2048 "
-                               "for OPENAI_API_SERVER usage context.")
-                self.max_num_seqs = 1024
                 self.max_num_batched_tokens = 2048
+            logger.warning(
+                "Setting max_num_batched_tokens to %d for %s usage context.",
+                self.max_num_batched_tokens, usage_context.value)
 
     def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
         """
