12 changes: 10 additions & 2 deletions tensorrt_llm/_torch/pyexecutor/_util.py
Expand Up @@ -50,8 +50,8 @@ def __init__(self, *, executor_config: ExecutorConfig,
self._draft_model_engine = draft_model_engine
self._mapping = mapping
self._max_kv_tokens_in = self._executor_config.kv_cache_config.max_tokens
self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
1)
self._dummy_reqs = None
self._max_seq_len = net_max_seq_len

@staticmethod
def _get_cache_size_per_token(model_config: ModelConfig,
@@ -168,6 +168,10 @@ def _get_token_num_for_estimation(self) -> int:
         if spec_cfg is not None:
             num_extra_tokens_per_seq += spec_cfg.max_draft_len
             num_extra_tokens_per_seq += get_num_extra_kv_tokens(spec_cfg)
+
+        if self._dummy_reqs is None:
+            self._dummy_reqs = self._create_dummy_context_requests(
+                max(1, self._max_seq_len - 1))
         for req in self._dummy_reqs:
             num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
             # Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
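Taken together, these two hunks replace eager construction of the dummy context requests in __init__ with lazy construction on first use, so a max_seq_len that is lowered after __init__ (see the next hunk) is still respected. Below is a minimal, self-contained sketch of the pattern; Estimator and make_dummy_requests are hypothetical stand-ins, not the real TensorRT-LLM API.

from typing import List, Optional


def make_dummy_requests(seq_len: int) -> List[list]:
    # Stand-in for _create_dummy_context_requests: one request whose
    # token count scales with the requested sequence length.
    return [[0] * seq_len]


class Estimator:

    def __init__(self, net_max_seq_len: int):
        # Do NOT build the dummy requests eagerly: _max_seq_len may still be
        # lowered (e.g. by the KV cache manager under SWA) before estimation.
        self._dummy_reqs: Optional[List[list]] = None
        self._max_seq_len = net_max_seq_len

    def token_num_for_estimation(self) -> int:
        if self._dummy_reqs is None:
            # Created on first use, after _max_seq_len has settled.
            # max(1, ...) guards against a degenerate max_seq_len of 1.
            self._dummy_reqs = make_dummy_requests(max(1, self._max_seq_len - 1))
        return sum(len(r) for r in self._dummy_reqs)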
@@ -381,6 +385,10 @@ def _create_kv_cache_manager(
         if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
             executor_config.max_seq_len = kv_cache_manager.max_seq_len

+        # When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
+        if kv_cache_manager is not None:
+            self._max_seq_len = kv_cache_manager.max_seq_len
+
         return kv_cache_manager

     def build_managers(self, resources: Dict) -> None:
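This hunk supplies the value the lazy path above consumes: the KV cache manager may clamp max_seq_len (e.g. for sliding-window attention), and the estimator must read the clamped value back before it builds its dummy requests. A hypothetical sketch of that ordering, reusing the Estimator stand-in from the previous sketch:

class KVCacheManagerStub:

    def __init__(self, requested_max_seq_len: int, attention_window: int):
        # Under SWA the manager can lower the effective max sequence length.
        self.max_seq_len = min(requested_max_seq_len, attention_window)


def create_kv_cache_manager(estimator: Estimator, attention_window: int):
    kv_cache_manager = KVCacheManagerStub(estimator._max_seq_len,
                                          attention_window)
    if kv_cache_manager is not None:
        # Mirror the hunk: sync the possibly-clamped value back, so the dummy
        # requests created later in _get_token_num_for_estimation fit what the
        # manager can actually serve.
        estimator._max_seq_len = kv_cache_manager.max_seq_len
    return kv_cache_manager


# Usage: with a window of 4096, an 8192 request is clamped, and the lazily
# created dummy requests will use 4096 - 1 tokens rather than 8192 - 1.
create_kv_cache_manager(Estimator(8192), attention_window=4096)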
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -485,7 +485,7 @@ def calculate_max_num_blocks(self,
         if kv_cache_config.free_gpu_memory_fraction is not None:
             max_tokens = min(kv_cache_config.max_tokens, max_tokens)
             logger.warning(
-                f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {kv_cache_config.max_tokens}, respectively). The smaller value will be used.'
+                f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {max_tokens} with free memory {free_mem / (1 << 32)} of total memory {total_mem / (1<<32)}, respectively). The smaller value will be used.'
             )
         else:
             max_tokens = kv_cache_config.max_tokens
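One detail worth noting in the new message: one GiB is (1 << 30) bytes, so dividing by (1 << 32) reports memory in units of 4 GiB. If GiB is the intended unit, a small helper along these lines (hypothetical, not part of this diff) keeps the conversion explicit:

def bytes_to_gib(num_bytes: int) -> float:
    """Convert a byte count to GiB, i.e. units of 2**30 bytes."""
    return num_bytes / (1 << 30)


# Example: 24 GiB of free memory, expressed in bytes.
free_mem = 24 * (1 << 30)
print(f'free memory {bytes_to_gib(free_mem):.2f} GiB')  # free memory 24.00 GiB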