@@ -122,7 +122,7 @@ class EngineArgs:
     cpu_offload_gb: float = 0  # GiB
     gpu_memory_utilization: float = 0.90
     max_num_batched_tokens: Optional[int] = None
-    max_num_seqs: int = 256
+    max_num_seqs: Optional[int] = None
     max_logprobs: int = 20  # Default value for OpenAI Chat Completions API
     disable_log_stats: bool = False
     revision: Optional[str] = None
@@ -205,6 +205,9 @@ def __post_init__(self):
         # by user.
         if self.enable_prefix_caching is None:
             self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
+        # Override max_num_seqs if it's not set by user.
+        if self.max_num_seqs is None:
+            self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
 
         # support `EngineArgs(compilation_config={...})`
         # without having to manually construct a
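Taken together, the two hunks above turn max_num_seqs from a hard-coded 256 into an Optional field whose default is resolved in __post_init__: 256 on the V0 engine, 1024 when VLLM_USE_V1 is enabled, and any explicit user value left untouched. A minimal standalone sketch of that resolution logic (the function name is hypothetical, and use_v1 stands in for envs.VLLM_USE_V1):

    from typing import Optional

    def resolve_max_num_seqs(max_num_seqs: Optional[int], use_v1: bool) -> int:
        # An explicit user value always wins; only the unset default
        # depends on which engine is in use.
        if max_num_seqs is not None:
            return max_num_seqs
        return 1024 if use_v1 else 256

    assert resolve_max_num_seqs(None, use_v1=False) == 256   # V0 default
    assert resolve_max_num_seqs(None, use_v1=True) == 1024   # V1 default
    assert resolve_max_num_seqs(32, use_v1=True) == 32       # user override wins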
@@ -1225,19 +1228,19 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         """
         assert envs.VLLM_USE_V1, "V1 is not enabled"
 
+        # V1 always uses chunked prefills.
+        self.enable_chunked_prefill = True
+        # When no user override, set the default values based on the usage
+        # context.
+        # TODO(woosuk): Tune the default values for different hardware.
         if self.max_num_batched_tokens is None:
-            # When no user override, set the default values based on the
-            # usage context.
             if usage_context == UsageContext.LLM_CLASS:
-                logger.warning("Setting max_num_batched_tokens to 8192 "
-                               "for LLM_CLASS usage context.")
-                self.max_num_seqs = 1024
                 self.max_num_batched_tokens = 8192
             elif usage_context == UsageContext.OPENAI_API_SERVER:
-                logger.warning("Setting max_num_batched_tokens to 2048 "
-                               "for OPENAI_API_SERVER usage context.")
-                self.max_num_seqs = 1024
                 self.max_num_batched_tokens = 2048
+            logger.warning(
+                "Setting max_num_batched_tokens to %d for %s usage context.",
+                self.max_num_batched_tokens, usage_context.value)
 
     def _override_v1_engine_config(self, engine_config: VllmConfig) -> None:
         """