diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 3db069ec64ee4..7b9adc401abcf 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -122,7 +122,7 @@ class EngineArgs:
     cpu_offload_gb: float = 0  # GiB
     gpu_memory_utilization: float = 0.90
     max_num_batched_tokens: Optional[int] = None
-    max_num_seqs: int = 256
+    max_num_seqs: Optional[int] = None
     max_logprobs: int = 20  # Default value for OpenAI Chat Completions API
     disable_log_stats: bool = False
     revision: Optional[str] = None
@@ -205,6 +205,9 @@ def __post_init__(self):
         # by user.
         if self.enable_prefix_caching is None:
             self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
+        # Override max_num_seqs if it's not set by user.
+        if self.max_num_seqs is None:
+            self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
 
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
@@ -1225,19 +1228,19 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         """
         assert envs.VLLM_USE_V1, "V1 is not enabled"
 
+        # V1 always uses chunked prefills.
+        self.enable_chunked_prefill = True
+        # When no user override, set the default values based on the usage
+        # context.
+        # TODO(woosuk): Tune the default values for different hardware.
         if self.max_num_batched_tokens is None:
-            # When no user override, set the default values based on the
-            # usage context.
             if usage_context == UsageContext.LLM_CLASS:
-                logger.warning("Setting max_num_batched_tokens to 8192 "
-                               "for LLM_CLASS usage context.")
-                self.max_num_seqs = 1024
                 self.max_num_batched_tokens = 8192
             elif usage_context == UsageContext.OPENAI_API_SERVER:
-                logger.warning("Setting max_num_batched_tokens to 2048 "
-                               "for OPENAI_API_SERVER usage context.")
-                self.max_num_seqs = 1024
                 self.max_num_batched_tokens = 2048
+            logger.warning(
+                "Setting max_num_batched_tokens to %d for %s usage context.",
+                self.max_num_batched_tokens, usage_context.value)
 
     def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
         """
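
Below is a minimal sketch (not part of the patch) of how the new defaults resolve end to end after this change. It uses simplified stand-ins: `use_v1` for `envs.VLLM_USE_V1`, plain strings in place of `UsageContext`, and a hypothetical `resolve_defaults` helper; the actual logic lives in `EngineArgs.__post_init__` and `_override_v1_engine_args`.

    # Illustrative sketch only; names below are stand-ins, not vLLM APIs.
    from typing import Optional


    def resolve_defaults(
        max_num_seqs: Optional[int],
        max_num_batched_tokens: Optional[int],
        use_v1: bool,
        usage_context: str,
    ) -> tuple[int, Optional[int]]:
        # __post_init__: max_num_seqs now defaults to None and is only
        # filled in when the user did not set it (1024 on V1, 256 on V0).
        if max_num_seqs is None:
            max_num_seqs = 1024 if use_v1 else 256

        # _override_v1_engine_args: on V1, pick max_num_batched_tokens
        # from the usage context when the user did not override it.
        if use_v1 and max_num_batched_tokens is None:
            if usage_context == "LLM_CLASS":
                max_num_batched_tokens = 8192
            elif usage_context == "OPENAI_API_SERVER":
                max_num_batched_tokens = 2048

        return max_num_seqs, max_num_batched_tokens


    print(resolve_defaults(None, None, use_v1=True, usage_context="LLM_CLASS"))
    # -> (1024, 8192)
    print(resolve_defaults(None, None, use_v1=False, usage_context="LLM_CLASS"))
    # -> (256, None)

The effect of the patch is that user-supplied values are never clobbered: the old code forced `max_num_seqs = 1024` inside `_override_v1_engine_args`, while the new code only picks a default when the field is still `None`.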