Temporarily revert "Fix 8K decode latency jump issue." (#108)
The reverted change (raising max_seq_len_to_capture from 8192 to 32768) has the unintended side effect of ensuring Paged Attention V1 is never called in graph mode, because the decode kernel choice depends on the maximum sequence length at graph capture time. The correct fix would move decode kernel selection before the graph capture point, but that would require a major refactor.
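For context, here is a minimal sketch of the selection logic at issue. It is modeled on the V1/V2 heuristic in vLLM's paged attention ops around this release; the 8192-token threshold and 512-token partition size mirror that heuristic, but the function name select_decode_kernel and the surrounding scaffolding are simplified stand-ins, not the actual vLLM code.

```python
# Illustrative sketch (not vLLM's actual code) of why capturing CUDA
# graphs at 32768 tokens sidelines Paged Attention V1. Modeled on the
# V1/V2 selection heuristic in vLLM's paged attention ops; the names
# and structure here are simplified assumptions.

_PARTITION_SIZE = 512  # partition size used by the V2 kernel's split pass


def select_decode_kernel(max_seq_len: int, num_seqs: int, num_heads: int) -> str:
    """Choose the paged-attention decode kernel for one decode step."""
    max_num_partitions = (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    # V1 (single-pass) only pays off for short sequences; V2 (split
    # kernel plus a reduction pass) handles long sequences.
    use_v1 = max_seq_len <= 8192 and (
        max_num_partitions == 1 or num_seqs * num_heads > 512
    )
    return "paged_attention_v1" if use_v1 else "paged_attention_v2"


# CUDA graph capture freezes whichever kernel is selected at capture
# time. Graphs captured with max_seq_len_to_capture = 32768 therefore
# always record the V2 kernel:
assert select_decode_kernel(32768, num_seqs=32, num_heads=32) == "paged_attention_v2"
# With the reverted default of 8192, V1 remains reachable in graph mode:
assert select_decode_kernel(8192, num_seqs=32, num_heads=32) == "paged_attention_v1"
```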
mawong-amd authored Jul 26, 2024
1 parent c73c75d commit 304efb3
Showing 2 changed files with 2 additions and 2 deletions.
vllm/engine/arg_utils.py (1 addition, 1 deletion)

@@ -57,7 +57,7 @@ class EngineArgs:
     quantized_weights_path: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
-    max_seq_len_to_capture: int = 32768
+    max_seq_len_to_capture: int = 8192
     disable_custom_all_reduce: bool = False
     tokenizer_pool_size: int = 0
     tokenizer_pool_type: str = "ray"
vllm/entrypoints/llm.py (1 addition, 1 deletion)

@@ -115,7 +115,7 @@ def __init__(
         swap_space: int = 4,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
-        max_seq_len_to_capture: int = 32768,
+        max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         **kwargs,
     ) -> None:
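After this revert, users who still want graphs captured for longer sequences can opt back in explicitly, since both changed files keep max_seq_len_to_capture as a configurable parameter. A usage sketch (the model name is a placeholder for illustration):

```python
from vllm import LLM

# Restore 32K graph capture explicitly; decode steps inside captured
# graphs will then use the Paged Attention V2 kernel, per the commit
# message above.
llm = LLM(
    model="facebook/opt-125m",  # placeholder model
    max_seq_len_to_capture=32768,
)
outputs = llm.generate("Hello, my name is")
```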
