5 files changed (+12 −6 lines)

```diff
@@ -110,7 +110,6 @@ def __init__(
         assert len(
             self.initial_local_expert_ids) == self.expert_size_per_partition

-        max_num_tokens = model_config.max_num_tokens
         # The maximum number of tokens in MoE is multiplied by the DP size when attention DP is enabled
         moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
         self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
```
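For context, the two surviving lines are the whole derivation: the per-rank token budget is scaled by the attention-DP size, and an explicit `model_config.moe_max_num_tokens` overrides the product. A minimal standalone sketch of that logic; the `SimpleNamespace` stand-ins for `model_config` and `mapping` are hypothetical, for illustration only:

```python
from types import SimpleNamespace

# Hypothetical stand-in for model_config; only the fields used here.
mapping = SimpleNamespace(dp_size=4)                      # attention DP across 4 ranks
model_config = SimpleNamespace(max_num_tokens=8192,
                               moe_max_num_tokens=None,   # no explicit override
                               mapping=mapping)

# With attention DP, each DP rank can contribute up to max_num_tokens rows,
# so the MoE layer may see up to max_num_tokens * dp_size rows at once.
moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens

print(moe_max_num_tokens)  # 32768 (8192 * 4); an explicit override would win instead
```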
```diff
@@ -339,7 +339,9 @@ def __init__(
         # It can avoid OOM for 8k/1k cases.
         default_moe_max_num_tokens = 18688
         if moe_max_num_tokens > default_moe_max_num_tokens:
+            model_config._frozen = False
             model_config.moe_max_num_tokens = default_moe_max_num_tokens
+            model_config._frozen = True

         super().__init__(
             routing_method=routing_method,
```
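The `_frozen` toggle suggests that `ModelConfig` rejects attribute writes once frozen, so applying the capped default has to thaw it briefly. A minimal sketch of that pattern, assuming exactly that semantics; this `FrozenConfig` class is hypothetical, not the real `ModelConfig`:

```python
class FrozenConfig:
    """Hypothetical config that blocks attribute writes while _frozen is True."""

    def __init__(self, moe_max_num_tokens):
        self._frozen = False          # allow writes during construction
        self.moe_max_num_tokens = moe_max_num_tokens
        self._frozen = True

    def __setattr__(self, name, value):
        # Writes to _frozen itself must always pass, or we could never thaw.
        if name != "_frozen" and getattr(self, "_frozen", False):
            raise AttributeError(f"config is frozen; cannot set {name!r}")
        super().__setattr__(name, value)


cfg = FrozenConfig(moe_max_num_tokens=32768)
cfg._frozen = False                   # thaw, mirroring the diff above
cfg.moe_max_num_tokens = 18688        # apply the capped default
cfg._frozen = True                    # refreeze
```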
```diff
@@ -600,9 +602,12 @@ def forward(
         else:
             num_rows = x.shape[0]

-        # in case of num_rows is larger than max_chunk_size, we need to split the input into multiple chunks
-        num_chunks = (num_rows + self.moe_max_num_tokens -
-                      1) // self.moe_max_num_tokens
+        # If num_rows is larger than moe_max_num_tokens * 2, split the input into multiple
+        # chunks, because chunked MoE uses two streams and preallocates two workspaces.
+        num_chunks = 1
+        if num_rows > self.moe_max_num_tokens * 2:
+            num_chunks = (num_rows + self.moe_max_num_tokens -
+                          1) // self.moe_max_num_tokens

         if use_dp_padding:
             all_rank_num_tokens_padded = [all_rank_max_num_tokens
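With the new guard, inputs up to twice the token budget now run as a single chunk instead of being split. A small sketch of just the arithmetic; `num_chunks_for` is a hypothetical helper that mirrors the chunk-count logic in `forward()` above:

```python
def num_chunks_for(num_rows: int, moe_max_num_tokens: int) -> int:
    """Mirror of the chunk-count logic shown in the diff above."""
    if num_rows <= moe_max_num_tokens * 2:
        return 1  # small enough: skip chunking entirely
    # Ceil division: split into chunks of at most moe_max_num_tokens rows.
    return (num_rows + moe_max_num_tokens - 1) // moe_max_num_tokens

budget = 18688  # the default_moe_max_num_tokens cap from the earlier hunk
print(num_chunks_for(16000, budget))  # 1  (under the budget)
print(num_chunks_for(30000, budget))  # 1  (over budget but <= 2x; the old code used 2)
print(num_chunks_for(60000, budget))  # 4  (ceil(60000 / 18688))
```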
```diff
@@ -81,7 +81,6 @@ def __init__(
             self.num_experts)
         self.expert_size_per_partition = self.expert_end - self.expert_start

-        max_num_tokens = model_config.max_num_tokens
         # The maximum number of tokens in MoE is multiplied by the DP size when attention DP is enabled
         moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
         self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
```
```diff
@@ -150,7 +150,6 @@ def __init__(
         assert len(
             self.initial_local_expert_ids) == self.expert_size_per_partition

-        max_num_tokens = model_config.max_num_tokens
         # The maximum number of tokens in MoE is multiplied by the DP size when attention DP is enabled
         moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
         self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
```
```diff
@@ -372,6 +372,10 @@ def node_rank(self):
     def local_rank(self):
        return self.rank % self.gpus_per_node

+    @property
+    def dp_size(self):
+        return self.tp_size if self.enable_attention_dp else 1
+
     def has_cp(self):
         return self.cp_size > 1
```
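This new property is what the `model_config.mapping.dp_size` factor in the MoE hunks above resolves to: the TP size when attention DP is on, and 1 otherwise. A quick illustration; this `Mapping` stand-in is schematic, carrying only the fields `dp_size` reads, while the real class takes many more arguments:

```python
class Mapping:
    """Schematic stand-in with just the fields the dp_size property reads."""

    def __init__(self, tp_size, enable_attention_dp):
        self.tp_size = tp_size
        self.enable_attention_dp = enable_attention_dp

    @property
    def dp_size(self):
        return self.tp_size if self.enable_attention_dp else 1


print(Mapping(tp_size=8, enable_attention_dp=True).dp_size)   # 8: token budget scales 8x
print(Mapping(tp_size=8, enable_attention_dp=False).dp_size)  # 1: budget unchanged
```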