diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
index 4b32f7f476b..392fff09115 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
@@ -411,7 +411,7 @@ def __init__(
 
     def get_workspace(self, m_max: int, group_size: int):
         hidden_size = self.hidden_size
-        intermediate_size = self.intermediate_size
+        intermediate_size = self.intermediate_size_per_partition
         num_experts = self.expert_size_per_partition
 
         # create workspace
@@ -564,7 +564,7 @@ def forward_chunk(
         # grouped gemm 1
         h1 = set_strides(workspace["workspace_1"],
                          self.expert_size_per_partition, m_max,
-                         self.intermediate_size * 2)
+                         self.intermediate_size_per_partition * 2)
 
         deepgemm_fp8_group_blockwise_gemm(
             d=h1,
@@ -579,9 +579,9 @@ def forward_chunk(
         # activation and quantization
         act_input_fp8 = set_strides(workspace["workspace_0"],
                                     self.expert_size_per_partition, m_max,
-                                    self.intermediate_size)
+                                    self.intermediate_size_per_partition)
 
-        scale_k = fp8_utils.ceil_div(self.intermediate_size, 128)
+        scale_k = fp8_utils.ceil_div(self.intermediate_size_per_partition, 128)
         scale_k_padded = fp8_utils.align(scale_k, 4)
         act_input_sf = set_strides(workspace["workspace_sf"],
                                    self.expert_size_per_partition,
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 2920f5b6972..1e359728ac8 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -583,7 +583,15 @@ def get_cuda_graph_warmup_request(batch_size, draft_len):
             # Add one dummy request with the maximum possible sequence length.
             # The sequence length is limited by both the max_seq_len and the number of available blocks.
+            # Also, the sequence length is limited by the max_position_embeddings.
             token_num = max(1, min(available_tokens, self.max_seq_len - 1))
+            model_config = self.model.model_config.pretrained_config
+            max_position_embeddings = getattr(model_config,
+                                              'max_position_embeddings',
+                                              None)
+            if max_position_embeddings is not None:
+                token_num = min(token_num,
+                                max_position_embeddings - draft_len)
             max_seq_len_request = kv_cache_manager.add_dummy_requests(
                 request_ids=[batch_size - 1],
                 token_nums=[token_num],
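
For reference, a minimal sketch of the two behaviors these hunks rely on; the names intermediate_size, tp_size, and the helper functions below are illustrative assumptions, not code from this PR. Under tensor parallelism the MoE intermediate dimension is sharded per rank, so the DeepGEMM workspaces must be sized with the per-partition value, and the CUDA-graph warmup dummy request is additionally capped by the model's max_position_embeddings.

# Illustrative sketch only; assumed names, not the TensorRT-LLM implementation.

def intermediate_size_per_partition(intermediate_size: int, tp_size: int) -> int:
    # Each tensor-parallel rank holds 1/tp_size of the FFN intermediate dimension,
    # so per-rank buffers (h1, act_input_fp8, their scales) use this value.
    assert intermediate_size % tp_size == 0
    return intermediate_size // tp_size

def clamp_warmup_token_num(available_tokens: int, max_seq_len: int,
                           max_position_embeddings, draft_len: int) -> int:
    # Warmup dummy request length: bounded by free KV-cache tokens, max_seq_len,
    # and (when the config defines it) max_position_embeddings minus the draft length.
    token_num = max(1, min(available_tokens, max_seq_len - 1))
    if max_position_embeddings is not None:
        token_num = min(token_num, max_position_embeddings - draft_len)
    return token_num

# Example: a 16384-token KV budget is clamped to 4096 - draft_len when the
# model's positional embeddings only cover 4096 positions.
print(clamp_warmup_token_num(16384, 8192, 4096, 3))  # -> 4093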