 from ..modules.rms_norm import RMSNorm
 from ..peft.lora.layer import LoraLayer
 from ..speculative import MTPEagleWorker, MTPSpecMetadata, MTPWorker
-from ..utils import (AuxStreamType, EventType, Fp4QuantizedTensor,
-                     disable_fp4_allgather)
+from ..utils import AuxStreamType, EventType, Fp4QuantizedTensor
 from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
                              EagerFusionConfig, filter_weights,
                              register_auto_model)
@@ -512,16 +511,9 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
         # max-throughput
         use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
-            # MoE use static heuristic to check alltoall enabled or not, however, for wide_ep, the alltoall could also be dynamically disabled when chunking is used or TRTLLM_DEEP_EP_TOKEN_LIMIT is hit.
-            is_wide_ep_alltoall_disabled = isinstance(
-                self.experts, WideEPMoE) and not self.experts.can_use_alltoall(
-                    hidden_states, all_rank_num_tokens)
-            alltoall_enabled = self.experts.enable_alltoall and not is_wide_ep_alltoall_disabled
-
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if (disable_fp4_allgather() and not alltoall_enabled) or isinstance(
-                    self.experts, TRTLLMGenFusedMoE):
+            if isinstance(self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,