Commit a4b7c0d

fix(eagle3): change pytorch_weights_path to speculative_model_dir

Merge commit a4b7c0d, 2 parents: 153bc70 + b04ba97

27 files changed: +1079 / −223 lines


cpp/tensorrt_llm/thop/allgatherOp.cpp

Lines changed: 41 additions & 50 deletions
@@ -55,70 +55,61 @@ class AllgatherOp
         return 0;
     }

-    torch::Tensor run(torch::Tensor input, torch::optional<torch::List<int64_t>> sizes)
+    std::vector<torch::Tensor> run_list(torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes)
     {
         TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used");
-        auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
-        auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
-        std::vector<int64_t> outputShape = input.sizes().vec();
-        if (sizes.has_value())
-        {
-            outputShape[0] = std::accumulate(sizes.value().begin(), sizes.value().end(), 0, std::plus<>{});
-        }
-        else
-        {
-            outputShape[0] *= mGroup.size();
-        }
-        auto output = torch::empty(outputShape, input.options());
         bool use_nccl_allgather = !sizes.has_value()
             || std::all_of(sizes.value().begin(), sizes.value().end(),
                 [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_allgather)
-        {
-            NCCLCHECK_THROW(ncclAllGather(input.data_ptr(), output.mutable_data_ptr(), input.numel(),
-                (*getDtypeMap())[type], *mNcclComm, stream));
-        }
-        else
-        {
-            size_t numel_base = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
-            int64_t split_offset = 0;
-            ncclGroupStart();
-            for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
-            {
-                auto split_size = sizes.value()[root];
-                NCCLCHECK_THROW(ncclBroadcast(input.data_ptr(),
-                    output.index({torch::indexing::Slice(split_offset, torch::indexing::None)}).mutable_data_ptr(),
-                    numel_base * split_size, (*getDtypeMap())[type], root, *mNcclComm, stream));
-                split_offset += split_size;
-            }
-            ncclGroupEnd();
-        }
-        return output;
-    }
-
-    std::vector<torch::Tensor> run_list(torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes)
-    {
+        int64_t sum_sizes
+            = sizes.has_value() ? std::accumulate(sizes.value().begin(), sizes.value().end(), 0, std::plus<>{}) : 0;
         std::vector<torch::Tensor> output_list;
         output_list.reserve(input_list.size());
-        bool use_nccl_allgather = !sizes.has_value()
-            || std::all_of(sizes.value().begin(), sizes.value().end(),
-                [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_allgather)
-        {
-            ncclGroupStart();
-        }
+        ncclGroupStart();
         for (auto const& input : input_list)
         {
-            auto output = run(input, sizes);
+            auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+            auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
+            std::vector<int64_t> outputShape = input.sizes().vec();
+            if (sizes.has_value())
+            {
+                outputShape[0] = sum_sizes;
+            }
+            else
+            {
+                outputShape[0] *= mGroup.size();
+            }
+            auto output = torch::empty(outputShape, input.options());
+            if (use_nccl_allgather)
+            {
+                ncclAllGather(input.data_ptr(), output.mutable_data_ptr(), input.numel(), (*getDtypeMap())[type],
+                    *mNcclComm, stream);
+            }
+            else
+            {
+                size_t numel_base
+                    = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
+                int64_t split_offset = 0;
+                for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
+                {
+                    auto split_size = sizes.value()[root];
+                    ncclBroadcast(input.data_ptr(),
+                        output.index({torch::indexing::Slice(split_offset, torch::indexing::None)}).mutable_data_ptr(),
+                        numel_base * split_size, (*getDtypeMap())[type], root, *mNcclComm, stream);
+                    split_offset += split_size;
+                }
+            }
             output_list.push_back(output);
         }
-        if (use_nccl_allgather)
-        {
-            ncclGroupEnd();
-        }
+        NCCLCHECK_THROW(ncclGroupEnd());
        return output_list;
     }

+    torch::Tensor run(torch::Tensor input, torch::optional<torch::List<int64_t>> sizes)
+    {
+        return run_list({input}, sizes)[0];
+    }
+
 private:
     std::set<int> mGroup;
     std::shared_ptr<ncclComm_t> mNcclComm;
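The core of this refactor is NCCL group semantics: every collective issued between ncclGroupStart() and ncclGroupEnd() is recorded and launched as one batch, so run_list can gather a whole list of tensors in a single grouped launch and the single-tensor run() can simply delegate to run_list({input}, sizes)[0]. A minimal standalone sketch of that pattern (illustrative only, not the TensorRT-LLM code; buffer allocation, communicator, and stream setup are assumed to happen elsewhere):

// Sketch only: gather a list of device buffers with one grouped NCCL launch.
// recvBufs[i] is assumed to hold nranks * counts[i] elements of the given dtype.
#include <nccl.h>
#include <cuda_runtime.h>
#include <vector>

void allgatherMany(std::vector<void const*> const& sendBufs, std::vector<void*> const& recvBufs,
    std::vector<size_t> const& counts, ncclDataType_t dtype, ncclComm_t comm, cudaStream_t stream)
{
    ncclGroupStart(); // open one batched group; nothing is launched yet
    for (size_t i = 0; i < sendBufs.size(); ++i)
    {
        // each call is only recorded while the group is open
        ncclAllGather(sendBufs[i], recvBufs[i], counts[i], dtype, comm, stream);
    }
    ncclGroupEnd(); // all of the gathers above are launched together here
}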

cpp/tensorrt_llm/thop/reducescatterOp.cpp

Lines changed: 41 additions & 51 deletions
@@ -55,79 +55,69 @@ class ReducescatterOp
         return 0;
     }

-    torch::Tensor run(torch::Tensor const& input, torch::optional<torch::List<int64_t>> sizes)
+    std::vector<torch::Tensor> run_list(torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes)
     {
         TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used");
-        auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
-        auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
-        std::vector<int64_t> outputShape = input.sizes().vec();
+        bool use_nccl_reducescatter = !sizes.has_value()
+            || std::all_of(sizes.value().begin(), sizes.value().end(),
+                [&sizes](int64_t size) { return size == sizes.value()[0]; });
+        int groupRank = 0;
         if (sizes.has_value())
         {
             auto rank = COMM_SESSION.getRank();
-            int groupRank = 0;
             for (auto const& currentRank : mGroup)
             {
                 if (rank == currentRank)
                     break;
                 ++groupRank;
             }
             TLLM_CHECK(static_cast<size_t>(groupRank) < mGroup.size());
-            outputShape[0] = sizes.value()[groupRank];
-        }
-        else
-        {
-            outputShape[0] = outputShape[0] / mGroup.size();
         }
-        auto output = torch::empty(outputShape, input.options());
-        bool use_nccl_reducescatter = !sizes.has_value()
-            || std::all_of(sizes.value().begin(), sizes.value().end(),
-                [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_reducescatter)
-        {
-            NCCLCHECK_THROW(ncclReduceScatter(input.data_ptr(), output.mutable_data_ptr(), output.numel(),
-                (*getDtypeMap())[type], ncclSum, *mNcclComm, stream));
-        }
-        else
+        std::vector<torch::Tensor> output_list;
+        output_list.reserve(input_list.size());
+        ncclGroupStart();
+        for (auto const& input : input_list)
         {
-            size_t numel_base = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
-            int64_t split_offset = 0;
-            ncclGroupStart();
-            for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
+            auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+            auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
+            std::vector<int64_t> outputShape = input.sizes().vec();
+            if (sizes.has_value())
             {
-                auto split_size = sizes.value()[root];
-                NCCLCHECK_THROW(
+                outputShape[0] = sizes.value()[groupRank];
+            }
+            else
+            {
+                outputShape[0] = outputShape[0] / mGroup.size();
+            }
+            auto output = torch::empty(outputShape, input.options());
+            if (use_nccl_reducescatter)
+            {
+                ncclReduceScatter(input.data_ptr(), output.mutable_data_ptr(), output.numel(), (*getDtypeMap())[type],
+                    ncclSum, *mNcclComm, stream);
+            }
+            else
+            {
+                size_t numel_base
+                    = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
+                int64_t split_offset = 0;
+                for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
+                {
+                    auto split_size = sizes.value()[root];
                     ncclReduce(input.index({torch::indexing::Slice(split_offset, torch::indexing::None)}).data_ptr(),
                         output.mutable_data_ptr(), numel_base * split_size, (*getDtypeMap())[type], ncclSum, root,
-                    *mNcclComm, stream));
-                split_offset += split_size;
+                        *mNcclComm, stream);
+                    split_offset += split_size;
+                }
             }
-            ncclGroupEnd();
+            output_list.push_back(output);
         }
-        return output;
+        NCCLCHECK_THROW(ncclGroupEnd());
+        return output_list;
     }

-    std::vector<torch::Tensor> run_list(
-        torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes) noexcept
+    torch::Tensor run(torch::Tensor const& input, torch::optional<torch::List<int64_t>> sizes)
     {
-        std::vector<torch::Tensor> output_list;
-        output_list.reserve(input_list.size());
-        bool use_nccl_reducescatter = !sizes.has_value()
-            || std::all_of(sizes.value().begin(), sizes.value().end(),
-                [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_reducescatter)
-        {
-            ncclGroupStart();
-        }
-        for (auto const& input : input_list)
-        {
-            auto output = run(input, sizes);
-            output_list.push_back(output);
-        }
-        if (use_nccl_reducescatter)
-        {
-            ncclGroupEnd();
-        }
-        return output_list;
+        return run_list({input}, sizes)[0];
     }

 private:
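The sizes-aware path above works around a limitation of ncclReduceScatter, which assumes every rank receives the same element count: when the splits are uneven, the code issues one ncclReduce per root inside the same group, so each root ends up with the reduced sum of only its slice. A rough sketch of that fallback under the same caveats as before (illustrative only; per-rank row counts, buffers, communicator, and stream are assumed to come from the caller):

// Sketch only: uneven reduce-scatter emulated with per-root ncclReduce calls.
// sendBuf holds sum(rows) * rowElems floats on every rank; rank r receives
// rows[r] * rowElems reduced floats in its own recvBuf.
#include <nccl.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

void reduceScatterUneven(float const* sendBuf, float* recvBuf, std::vector<int64_t> const& rows,
    size_t rowElems, ncclComm_t comm, cudaStream_t stream)
{
    ncclGroupStart();
    int64_t offset = 0;
    for (int root = 0; root < static_cast<int>(rows.size()); ++root)
    {
        size_t count = static_cast<size_t>(rows[root]) * rowElems; // elements owned by this root
        // every rank contributes its copy of the slice; only `root` receives the reduced sum
        ncclReduce(sendBuf + offset * rowElems, recvBuf, count, ncclFloat, ncclSum, root, comm, stream);
        offset += rows[root];
    }
    ncclGroupEnd(); // the whole loop launches as one batched operation
}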

examples/llm-api/quickstart_advanced.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ def add_llm_args(parser):
     parser.add_argument('--moe_backend',
                         type=str,
                         default='CUTLASS',
-                        choices=['CUTLASS', 'TRTLLM', 'VANILLA'])
+                        choices=['CUTLASS', 'TRTLLM', 'VANILLA', 'WIDEEP'])
     parser.add_argument('--enable_attention_dp',
                         default=False,
                         action='store_true')

jenkins/Build.groovy

Lines changed: 7 additions & 6 deletions
@@ -306,18 +306,19 @@ def uploadArtifacts(artifacts, prefix = UPLOAD_PATH, retryTimes = 2, serverId =
     for (it in artifacts) {
         def uploadpath = it.key
         def filepath = it.value
-        echo "uploading ${filepath} as ${uploadpath}"
-        trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", {
-            rtUpload (
-                serverId: serverId,
-                spec: """{
+        def spec = """{
             "files": [
                 {
                     "pattern": "${filepath}",
                     "target": "${prefix}/${uploadpath}"
                 }
             ]
-        }""",
+        }"""
+        echo "Uploading ${filepath} as ${uploadpath}. Spec: ${spec}"
+        trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", {
+            rtUpload (
+                serverId: serverId,
+                spec: spec,
             )
         })
     }

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 2 additions & 5 deletions
@@ -62,8 +62,7 @@
 from ..modules.rms_norm import RMSNorm
 from ..peft.lora.layer import LoraLayer
 from ..speculative import MTPEagleWorker, MTPSpecMetadata, MTPWorker
-from ..utils import (AuxStreamType, EventType, Fp4QuantizedTensor,
-                     disable_fp4_allgather)
+from ..utils import AuxStreamType, EventType, Fp4QuantizedTensor
 from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
                              EagerFusionConfig, filter_weights,
                              register_auto_model)
@@ -514,9 +513,7 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
         if self.use_dp and self.mapping.tp_size > 1:
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if (disable_fp4_allgather()
-                    and not self.experts.enable_alltoall) or isinstance(
-                        self.experts, TRTLLMGenFusedMoE):
+            if isinstance(self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
