diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 5f635f47697..44ef78fb4ad 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -50,8 +50,8 @@ def __init__(self, *, executor_config: ExecutorConfig,
         self._draft_model_engine = draft_model_engine
         self._mapping = mapping
         self._max_kv_tokens_in = self._executor_config.kv_cache_config.max_tokens
-        self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
-                                                               1)
+        self._dummy_reqs = None
+        self._max_seq_len = net_max_seq_len
 
     @staticmethod
     def _get_cache_size_per_token(model_config: ModelConfig,
@@ -168,6 +168,10 @@ def _get_token_num_for_estimation(self) -> int:
         if spec_cfg is not None:
             num_extra_tokens_per_seq += spec_cfg.max_draft_len
             num_extra_tokens_per_seq += get_num_extra_kv_tokens(spec_cfg)
+
+        if self._dummy_reqs is None:
+            self._dummy_reqs = self._create_dummy_context_requests(
+                max(1, self._max_seq_len - 1))
         for req in self._dummy_reqs:
             num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
             # Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
@@ -381,6 +385,10 @@ def _create_kv_cache_manager(
         if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
             executor_config.max_seq_len = kv_cache_manager.max_seq_len
 
+        # When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
+        if kv_cache_manager is not None:
+            self._max_seq_len = kv_cache_manager.max_seq_len
+
         return kv_cache_manager
 
     def build_managers(self, resources: Dict) -> None:
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index 5b1dd3e2091..1c5805bc40e 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -485,7 +485,7 @@ def calculate_max_num_blocks(self,
         if kv_cache_config.free_gpu_memory_fraction is not None:
             max_tokens = min(kv_cache_config.max_tokens, max_tokens)
             logger.warning(
-                f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {kv_cache_config.max_tokens}, respectively). The smaller value will be used.'
+                f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {max_tokens}, respectively, with free memory {free_mem / (1 << 30):.2f} GiB of total memory {total_mem / (1 << 30):.2f} GiB). The smaller value will be used.'
             )
         else:
             max_tokens = kv_cache_config.max_tokens
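
For reviewers, below is a minimal standalone sketch of the deferred-initialization pattern this patch introduces: instead of building the dummy context requests eagerly in `__init__` with a `max_seq_len` that may later be clamped (e.g. by the KV cache manager when sliding-window attention is enabled), creation is postponed until estimation time, after the final `max_seq_len` is known. All class and parameter names here (`KvCacheCreatorSketch`, `DummyRequest`, `tokens_per_block`, `on_kv_cache_manager_created`) are illustrative placeholders, not the real TensorRT-LLM classes; only the control flow mirrors the diff.

```python
# Sketch of the deferred dummy-request pattern from this patch.
# All names are illustrative placeholders; only the control flow
# mirrors the diff above.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class DummyRequest:
    input_token_ids: List[int]


class KvCacheCreatorSketch:

    def __init__(self, net_max_seq_len: int, tokens_per_block: int):
        # Before this patch: dummy requests were built eagerly with
        # net_max_seq_len - 1, freezing the sequence length too early.
        # Now only the length is stored; creation is deferred.
        self._dummy_reqs: Optional[List[DummyRequest]] = None
        self._max_seq_len = net_max_seq_len
        self._tokens_per_block = tokens_per_block

    def on_kv_cache_manager_created(self, manager_max_seq_len: int) -> None:
        # Mirrors _create_kv_cache_manager: with SWA, the KV cache manager
        # may clamp max_seq_len, so pick up the final value from it.
        self._max_seq_len = manager_max_seq_len

    def _create_dummy_context_requests(self, seq_len: int) -> List[DummyRequest]:
        return [DummyRequest(input_token_ids=list(range(seq_len)))]

    def token_num_for_estimation(self, extra_tokens_per_seq: int = 0) -> int:
        # Lazily build the dummy requests with the (possibly clamped)
        # max_seq_len; max(1, ...) guards against a zero-length request.
        if self._dummy_reqs is None:
            self._dummy_reqs = self._create_dummy_context_requests(
                max(1, self._max_seq_len - 1))
        total = 0
        for req in self._dummy_reqs:
            num_req_tokens = len(req.input_token_ids) + extra_tokens_per_seq
            # Requests cannot share KV cache blocks: round up to the
            # nearest integer multiple of the block size (ceil division).
            num_req_tokens = -(-num_req_tokens // self._tokens_per_block
                               ) * self._tokens_per_block
            total += num_req_tokens
        return total


# Usage: the manager clamps max_seq_len (e.g. for SWA) before estimation runs.
creator = KvCacheCreatorSketch(net_max_seq_len=8192, tokens_per_block=32)
creator.on_kv_cache_manager_created(manager_max_seq_len=4096)
print(creator.token_num_for_estimation())  # 4095 tokens rounded up to 4096
```

The design point is ordering: estimation must not observe a `max_seq_len` that the KV cache manager has not yet had a chance to reduce, so the dummy requests can only be materialized after `_create_kv_cache_manager` has run.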