Skip to content

Commit 50de822

Browse files
HuiGao-NV authored and dominicshanshan committed
[https://nvbugs/5474169][fix]Adjust max seq len for kvcache for memory estimation (NVIDIA#7391)
Signed-off-by: Hui Gao <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
1 parent f8fa375 commit 50de822

File tree

2 files changed

+11
-3
lines changed

2 files changed

+11
-3
lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,14 @@ def __init__(
6767
self._max_kv_tokens_in = self._kv_cache_config.max_tokens
6868
self._max_num_tokens = max_num_tokens
6969
self._max_beam_width = max_beam_width
70-
self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
71-
1)
7270
self._kv_connector_manager = kv_connector_manager
7371
self._pytorch_backend_config = pytorch_backend_config
7472
self._speculative_config = speculative_config
7573
self._tokens_per_block = tokens_per_block
7674
self._max_seq_len = max_seq_len
7775
self._max_batch_size = max_batch_size
76+
self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
77+
1)
7878

7979
@staticmethod
8080
def _get_cache_size_per_token(model_config: ModelConfig,
@@ -196,6 +196,10 @@ def _get_token_num_for_estimation(self) -> int:
196196
if spec_cfg is not None:
197197
num_extra_tokens_per_seq += spec_cfg.max_draft_len
198198
num_extra_tokens_per_seq += get_num_extra_kv_tokens(spec_cfg)
199+
200+
if self._dummy_reqs is None:
201+
self._dummy_reqs = self._create_dummy_context_requests(
202+
max(1, self.net_max_seq_len - 1))
199203
for req in self._dummy_reqs:
200204
num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
201205
# Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
@@ -480,6 +484,10 @@ def _create_kv_cache_manager(
480484
if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
481485
self._max_seq_len = kv_cache_manager.max_seq_len
482486

487+
# When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
488+
if kv_cache_manager is not None:
489+
self._max_seq_len = kv_cache_manager.max_seq_len
490+
483491
return kv_cache_manager
484492

485493
def build_managers(self,

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,7 @@ def calculate_max_num_blocks(self,
582582
if kv_cache_config.free_gpu_memory_fraction is not None:
583583
max_tokens = min(kv_cache_config.max_tokens, max_tokens)
584584
logger.warning(
585-
f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {kv_cache_config.max_tokens}, respectively). The smaller value will be used.'
585+
f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {max_tokens} with free memory {free_mem / (1 << 32)} of total memory {total_mem / (1<<32)}, respectively). The smaller value will be used.'
586586
)
587587
else:
588588
max_tokens = kv_cache_config.max_tokens

0 commit comments

Comments (0)