Merge pull request #55 from ROCm/cl/fix-8k-issue

Fix the 8K decode latency jump issue by raising the default `max_seq_len_to_capture` from 8192 to 32768.
ROCm · Jun 19, 2024 · 719bf9d
2 parents 131b217 + 3c86a03
commit 719bf9d
Show file tree

Hide file tree

Showing 2 changed files with 2 additions and 2 deletions.
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -56,7 +56,7 @@ class EngineArgs:
     quantization: Optional[str] = None
     enforce_eager: bool = False
     max_context_len_to_capture: Optional[int] = None
-    max_seq_len_to_capture: int = 8192
+    max_seq_len_to_capture: int = 32768
     disable_custom_all_reduce: bool = False
     tokenizer_pool_size: int = 0
     tokenizer_pool_type: str = "ray"

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
@@ -115,7 +115,7 @@ def __init__(
         swap_space: int = 4,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
-        max_seq_len_to_capture: int = 8192,
+        max_seq_len_to_capture: int = 32768,
         disable_custom_all_reduce: bool = False,
         **kwargs,
     ) -> None: