
Commit 35ba966

all2all check from both chunking and threshold
Signed-off-by: Vincent Huang <[email protected]>
1 parent ea2c89c commit 35ba966

2 files changed: +21 -12 lines changed

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 8 additions & 3 deletions
@@ -512,11 +512,16 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
         # max-throughput
         use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
+            # MoE use static heuristic to check alltoall enabled or not, however, for wide_ep, the alltoall could also be dynamically disabled when chunking is used or TRTLLM_DEEP_EP_TOKEN_LIMIT is hit.
+            is_wide_ep_alltoall_disabled = isinstance(
+                self.experts, WideEPMoE) and not self.experts.can_use_alltoall(
+                    hidden_states, all_rank_num_tokens)
+            alltoall_enabled = self.experts.enable_alltoall and not is_wide_ep_alltoall_disabled
+
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if (disable_fp4_allgather()
-                    and not self.experts.can_use_alltoall(hidden_states_fp4 or hidden_states)) or isinstance(
-                        self.experts, TRTLLMGenFusedMoE):
+            if (disable_fp4_allgather() and not alltoall_enabled) or isinstance(
+                    self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
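For context, a minimal sketch of the gating this hunk introduces in compute_routed_output: the static enable_alltoall flag is combined with the per-iteration can_use_alltoall check, which only applies to the wide-EP MoE. The resolve_alltoall_enabled helper and the stand-in experts object below are illustrative, not part of the TensorRT-LLM API.

from types import SimpleNamespace


def resolve_alltoall_enabled(experts, is_wide_ep, hidden_states,
                             all_rank_num_tokens):
    # Static heuristic: the MoE module decides up front whether alltoall is
    # enabled at all. Dynamic check (wide EP only): chunking or the DeepEP
    # token limit can still disable it for this forward pass.
    is_wide_ep_alltoall_disabled = is_wide_ep and not experts.can_use_alltoall(
        hidden_states, all_rank_num_tokens)
    return experts.enable_alltoall and not is_wide_ep_alltoall_disabled


# Stand-in experts: alltoall statically enabled but dynamically rejected this step.
experts = SimpleNamespace(enable_alltoall=True,
                          can_use_alltoall=lambda h, t: False)
print(resolve_alltoall_enabled(experts, True, None, [128, 128]))   # False
print(resolve_alltoall_enabled(experts, False, None, [128, 128]))  # True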

tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py

Lines changed: 13 additions & 9 deletions
@@ -278,7 +278,16 @@ def enable_alltoall(self):
         """
         return self.alltoall_method_type != AlltoallMethodType.NotEnabled
 
-    def can_use_alltoall(self, input):
+    def calculate_num_chunks(self, all_rank_num_tokens: List[int]) -> int:
+        num_rows = sum(all_rank_num_tokens)
+        return (num_rows + self.moe_max_num_tokens -
+                1) // self.moe_max_num_tokens
+
+    def can_use_alltoall(self, input, all_rank_num_tokens):
+        # Disable alltoall when chunking is used
+        if self.calculate_num_chunks(all_rank_num_tokens) > 1:
+            return False
+
         num_tokens = input.shape[0]
 
         # For DeepEPLowLatency, check if tokens exceed the threshold
@@ -521,7 +530,7 @@ def forward_chunk(
                     f"unsupported quantization mode: {self.quant_config.quant_mode}"
                 )
 
-        if use_allgather and not use_all_to_all:
+        if use_allgather:
            # using allgather case.
            if self.enable_dummy_allreduce:
                self.dummy_allreduce()
@@ -766,20 +775,17 @@ def forward(
     ) -> torch.Tensor:
         assert all_rank_num_tokens is not None
         assert use_dp_padding is not None
-        num_rows = sum(all_rank_num_tokens)
 
         # in case of num_rows is larger than max_chunk_size, we need to split the input into multiple chunks
-        num_chunks = (num_rows + self.moe_max_num_tokens -
-                      1) // self.moe_max_num_tokens
+        num_chunks = self.calculate_num_chunks(all_rank_num_tokens)
+        use_all_to_all = self.can_use_alltoall(x, all_rank_num_tokens)
 
         if use_dp_padding:
             all_rank_num_tokens_padded = [all_rank_max_num_tokens
                                           ] * len(all_rank_num_tokens)
         else:
             all_rank_num_tokens_padded = all_rank_num_tokens
         if num_chunks == 1:
-            use_all_to_all = self.can_use_alltoall(x)
-
             is_first_call = self.repeat_idx == 0
             is_last_call = self.repeat_idx == self.repeat_count - 1
             outputs = self.forward_chunk(
@@ -798,8 +804,6 @@ def forward(
                 use_dp_padding=use_dp_padding)
         else:
 
-            use_all_to_all = False
-
             def split_chunk(split_token_num: int, split_num_chunks: int):
                 val_div = split_token_num // split_num_chunks
                 val_mod = split_token_num % split_num_chunks
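The hunk ends before split_chunk returns; a plausible completion (an assumption based on the common even-split pattern, not taken from the commit) spreads the remainder over the leading chunks:

def split_chunk(split_token_num: int, split_num_chunks: int):
    # Even split with the remainder distributed across the first chunks
    # (assumed continuation; the diff only shows the two divisions above).
    val_div = split_token_num // split_num_chunks
    val_mod = split_token_num % split_num_chunks
    return [val_div + 1] * val_mod + [val_div] * (split_num_chunks - val_mod)


print(split_chunk(10, 3))  # [4, 3, 3]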
