Temporarily revert "Fix 8K decode latency jump issue." (#108)
The reverted change (raising max_seq_len_to_capture from 8192 to 32768) has the unintended side effect of ensuring Paged Attention V1 is never called in graph mode, because the decode kernel choice depends on the maximum sequence length at graph capture time. The correct fix would move decode kernel selection before the graph capture point, but that would require a major refactor.
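For context, here is a minimal sketch of the selection logic at issue. It is modeled on the V1/V2 heuristic in vLLM's paged attention ops around this release; the 8192-token threshold and 512-token partition size mirror that heuristic, but the function name select_decode_kernel and the surrounding scaffolding are simplified stand-ins, not the actual vLLM code.

```python
# Illustrative sketch (not vLLM's actual code) of why capturing CUDA
# graphs at 32768 tokens sidelines Paged Attention V1. Modeled on the
# V1/V2 selection heuristic in vLLM's paged attention ops; the names
# and structure here are simplified assumptions.

_PARTITION_SIZE = 512  # partition size used by the V2 kernel's split pass


def select_decode_kernel(max_seq_len: int, num_seqs: int, num_heads: int) -> str:
    """Choose the paged-attention decode kernel for one decode step."""
    max_num_partitions = (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    # V1 (single-pass) only pays off for short sequences; V2 (split
    # kernel plus a reduction pass) handles long sequences.
    use_v1 = max_seq_len <= 8192 and (
        max_num_partitions == 1 or num_seqs * num_heads > 512
    )
    return "paged_attention_v1" if use_v1 else "paged_attention_v2"


# CUDA graph capture freezes whichever kernel is selected at capture
# time. Graphs captured with max_seq_len_to_capture = 32768 therefore
# always record the V2 kernel:
assert select_decode_kernel(32768, num_seqs=32, num_heads=32) == "paged_attention_v2"
# With the reverted default of 8192, V1 remains reachable in graph mode:
assert select_decode_kernel(8192, num_seqs=32, num_heads=32) == "paged_attention_v1"
```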
mawong-amd authored Jul 26, 2024
1 parent c73c75d commit 304efb3
Showing 2 changed files with 2 additions and 2 deletions.
vllm/engine/arg_utils.py (1 addition, 1 deletion)

@@ -57,7 +57,7 @@ class EngineArgs:
     quantized_weights_path: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
-    max_seq_len_to_capture: int = 32768
+    max_seq_len_to_capture: int = 8192
     disable_custom_all_reduce: bool = False
     tokenizer_pool_size: int = 0
     tokenizer_pool_type: str = "ray"
vllm/entrypoints/llm.py (1 addition, 1 deletion)

@@ -115,7 +115,7 @@ def __init__(
         swap_space: int = 4,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
-        max_seq_len_to_capture: int = 32768,
+        max_seq_len_to_capture: int = 8192,
         disable_custom_all_reduce: bool = False,
         **kwargs,
     ) -> None:
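After this revert, users who still want graphs captured for longer sequences can opt back in explicitly, since both changed files keep max_seq_len_to_capture as a configurable parameter. A usage sketch (the model name is a placeholder for illustration):

```python
from vllm import LLM

# Restore 32K graph capture explicitly; decode steps inside captured
# graphs will then use the Paged Attention V2 kernel, per the commit
# message above.
llm = LLM(
    model="facebook/opt-125m",  # placeholder model
    max_seq_len_to_capture=32768,
)
outputs = llm.generate("Hello, my name is")
```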
