diff --git a/vllm/config.py b/vllm/config.py
index e9559c40dbdfb..c2a8c956b374a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -84,9 +84,6 @@ class ModelConfig:
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
             If None, the user did not specify, so default to False.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
         max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode. Additionally for encoder-decoder models, if the
@@ -147,7 +144,6 @@ def __init__(
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
         enforce_eager: Optional[bool] = None,
-        max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: Optional[int] = None,
         max_logprobs: int = 20,
         disable_sliding_window: bool = False,
@@ -181,9 +177,6 @@
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
         self.enforce_eager = enforce_eager
-        if max_context_len_to_capture is not None:
-            raise ValueError("`max_context_len_to_capture` is deprecated. "
-                             "Use `max_seq_len_to_capture` instead.")
         self.max_seq_len_to_capture = max_seq_len_to_capture
         self.max_logprobs = max_logprobs
         self.disable_sliding_window = disable_sliding_window
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index de886c98e51bd..b1f0f8b9df925 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -126,7 +126,6 @@ class EngineArgs:
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
     enforce_eager: Optional[bool] = None
-    max_context_len_to_capture: Optional[int] = None
     max_seq_len_to_capture: int = 8192
     disable_custom_all_reduce: bool = False
     tokenizer_pool_size: int = 0
@@ -504,14 +503,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                             help='Always use eager-mode PyTorch. If False, '
                             'will use eager mode and CUDA graph in hybrid '
                             'for maximal performance and flexibility.')
-        parser.add_argument('--max-context-len-to-capture',
-                            type=int,
-                            default=EngineArgs.max_context_len_to_capture,
-                            help='Maximum context length covered by CUDA '
-                            'graphs. When a sequence has context length '
-                            'larger than this, we fall back to eager mode. '
-                            '(DEPRECATED. Use --max-seq-len-to-capture instead'
-                            ')')
         parser.add_argument('--max-seq-len-to-capture',
                             type=int,
                             default=EngineArgs.max_seq_len_to_capture,
@@ -939,7 +930,6 @@ def create_model_config(self) -> ModelConfig:
             quantization=self.quantization,
             quantization_param_path=self.quantization_param_path,
             enforce_eager=self.enforce_eager,
-            max_context_len_to_capture=self.max_context_len_to_capture,
             max_seq_len_to_capture=self.max_seq_len_to_capture,
             max_logprobs=self.max_logprobs,
             disable_sliding_window=self.disable_sliding_window,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 083b67c2f8e7d..3d62cb3598477 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -93,9 +93,6 @@ class LLM:
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
         max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
             When a sequence has context length larger than this, we fall back
             to eager mode. Additionally for encoder-decoder models, if the
@@ -152,7 +149,6 @@ def __init__(
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: Optional[bool] = None,
-        max_context_len_to_capture: Optional[int] = None,
         max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         disable_async_output_proc: bool = False,
@@ -193,7 +189,6 @@
             swap_space=swap_space,
             cpu_offload_gb=cpu_offload_gb,
             enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
             max_seq_len_to_capture=max_seq_len_to_capture,
             disable_custom_all_reduce=disable_custom_all_reduce,
             disable_async_output_proc=disable_async_output_proc,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 233a9e664d845..891637dafbb14 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -995,7 +995,7 @@ def __init__(
         # Python can be expensive. To optimize this, we cache the block table
         # in numpy and only copy the actual input content at every iteration.
         # The shape of the cached block table will be
-        # (max batch size to capture, max context len to capture / block size).
+        # (max batch size to capture, max seq len to capture / block size).
        self.graph_block_tables = np.zeros(
            (self.max_batchsize_to_capture, self.get_max_block_per_batch()),
            dtype=np.int32)
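For downstream callers the migration is mechanical: drop the removed keyword and pass `max_seq_len_to_capture` instead. A minimal sketch of the updated usage (the model name and values here are illustrative, not taken from this diff):

```python
from vllm import LLM, SamplingParams

# Before this change, max_context_len_to_capture=... raised a deprecation
# error; after this diff the keyword no longer exists at all.
llm = LLM(
    model="facebook/opt-125m",    # illustrative model name
    max_seq_len_to_capture=8192,  # replaces max_context_len_to_capture
    enforce_eager=False,          # keep CUDA graphs enabled
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)
```

The same applies on the command line: only `--max-seq-len-to-capture` remains after this change.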