From 3c86a032214e985d7be1d2b292fa8382f234912c Mon Sep 17 00:00:00 2001 From: lcskrishna Date: Wed, 19 Jun 2024 14:39:49 +0000 Subject: [PATCH] Fix 8k context-length limit by raising the max_seq_len_to_capture default from 8192 to 32768 --- vllm/engine/arg_utils.py | 2 +- vllm/entrypoints/llm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9fe0d0bb0a301..04b9e8032c0cf 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -56,7 +56,7 @@ class EngineArgs: quantization: Optional[str] = None enforce_eager: bool = False max_context_len_to_capture: Optional[int] = None - max_seq_len_to_capture: int = 8192 + max_seq_len_to_capture: int = 32768 disable_custom_all_reduce: bool = False tokenizer_pool_size: int = 0 tokenizer_pool_type: str = "ray" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 6e971ae73f5d0..bbb5d31f0606a 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -115,7 +115,7 @@ def __init__( swap_space: int = 4, enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, - max_seq_len_to_capture: int = 8192, + max_seq_len_to_capture: int = 32768, disable_custom_all_reduce: bool = False, **kwargs, ) -> None: