 
 #include <nlohmann/json.hpp>
 
+#ifdef USING_OSS_CUTLASS_MOE_GEMM
+#include "tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h"
+#else
 #include "moe_kernels.h"
+#endif
+
+#include "tensorrt_llm/kernels/cutlass_kernels/include/cutlass_kernel_selector.h"
+
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/memoryUtils.h"
 #include "tensorrt_llm/common/nvtxUtils.h"
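For reference, the new cutlass_kernel_selector.h presumably supplies the CUTLASS_MOE_GEMM_NAMESPACE and CUTLASS_MOE_GEMM_KERNELS_NAMESPACE macros consumed by the using-declarations in the next hunk. A minimal sketch of what such a selector header could look like; the concrete namespace names are assumptions for illustration, not taken from this diff:

// Hypothetical sketch of a kernel-selector header: routes the MoE GEMM
// symbols to either the OSS CUTLASS build or the internal build.
// The namespace names below are assumed, not confirmed by this diff.
#ifdef USING_OSS_CUTLASS_MOE_GEMM
#define CUTLASS_MOE_GEMM_NAMESPACE tensorrt_llm::kernels::cutlass_kernels
#define CUTLASS_MOE_GEMM_KERNELS_NAMESPACE tensorrt_llm::kernels::cutlass_kernels
#else
#define CUTLASS_MOE_GEMM_NAMESPACE tensorrt_llm::kernels
#define CUTLASS_MOE_GEMM_KERNELS_NAMESPACE tensorrt_llm::kernels
#endif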
@@ -42,6 +49,12 @@ using namespace tensorrt_llm::common;
 using namespace tensorrt_llm::runtime;
 using namespace tensorrt_llm::cutlass_extensions;
 
+using namespace CUTLASS_MOE_GEMM_KERNELS_NAMESPACE;
+using CUTLASS_MOE_GEMM_NAMESPACE::TmaWarpSpecializedGroupedGemmInput;
+using CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::CutlassMoeFCRunner;
+using CUTLASS_MOE_GEMM_NAMESPACE::ActivationType;
+using CUTLASS_MOE_GEMM_NAMESPACE::isGatedActivation;
+
 static BufferManager::CudaStreamPtr streamPtr;
 static std::unique_ptr<BufferManager> bufferManager;
 static int deviceCount;
@@ -485,7 +498,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     bool mIsGated = false;
     int mGatedMultiplier = 1;
 
-    tensorrt_llm::ActivationType mActType = tensorrt_llm::ActivationType::Relu;
+    ActivationType mActType = ActivationType::Relu;
 
     QuantParams mQuantParams{};
     bool mUseLora = false;
@@ -650,9 +663,15 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
             "Tactic Profiling GEMM " + std::to_string(static_cast<int>(gemm_to_profile)));
 
         GemmProfilerBackend profiler;
+#ifdef USING_OSS_CUTLASS_MOE_GEMM
+        profiler.init(mMoERunner, gemm_to_profile, typeToDtypeID<DataType>(), typeToDtypeID<WeightType>(),
+            typeToDtypeID<OutputType>(), mNumExperts, mK, mHiddenSize, mInterSize, mGroupSize, mActType, mUseBias,
+            mUseLora, /*min_latency_mode=*/false, /*need_weights=*/true, parallelism_config, /*enable_alltoall=*/false);
+#else
         profiler.init(mMoERunner, gemm_to_profile, typeToDtypeID<DataType>(), typeToDtypeID<WeightType>(),
             typeToDtypeID<OutputType>(), mNumExperts, mK, mHiddenSize, mInterSize, mGroupSize, mActType, mUseBias,
             mUseLora, /*min_latency_mode=*/false, /*need_weights=*/true, parallelism_config);
+#endif
         auto workspace_size = profiler.getWorkspaceSize(mTotalTokens);
         auto workspace = bufferManager->gpu(workspace_size);
 
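The two branches above differ only in the trailing enable_alltoall flag of the OSS init() signature. A hedged sketch of a helper that would keep the #ifdef in one place; initGemmProfiler is a hypothetical name, not part of this change:

#include <utility>

// Hypothetical forwarding helper: appends the extra enable_alltoall flag
// only in the OSS build. This works because that flag is the last
// parameter of the OSS init() overload shown above.
template <typename Profiler, typename... Args>
void initGemmProfiler(Profiler& profiler, Args&&... args)
{
#ifdef USING_OSS_CUTLASS_MOE_GEMM
    profiler.init(std::forward<Args>(args)..., /*enable_alltoall=*/false);
#else
    profiler.init(std::forward<Args>(args)...);
#endif
}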
@@ -760,11 +779,19 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     {
         auto stream = streamPtr->get();
         MoeMinLatencyParams min_latency_params;
+#ifdef USING_OSS_CUTLASS_MOE_GEMM
         mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
             mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
             mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
-            parallelism_config, mUseLora, mLoraParams,
+            parallelism_config, /*enable_alltoall=*/false, mUseLora, mLoraParams,
             /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
+#else
+        mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
+            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
+            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
+            parallelism_config, mUseLora, mLoraParams, /*use_deepseek_fp8_block_scale=*/false,
+            /*min_latency_mode=*/false, min_latency_params, stream);
+#endif
     }
 
     void runBenchmark(benchmark::State& state);
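Here the OSS runMoe() takes enable_alltoall in the middle of the argument list, so a trailing-args forwarder like the one sketched earlier does not apply. One alternative, sketched under the assumption that the two signatures are otherwise identical, is a build-specific macro that expands to the extra argument only in the OSS configuration; MOE_ENABLE_ALLTOALL_ARG is a hypothetical name:

// Hypothetical macro shim: expands to the extra argument only when the
// OSS signature expects it, letting both builds share one runMoe() call.
#ifdef USING_OSS_CUTLASS_MOE_GEMM
#define MOE_ENABLE_ALLTOALL_ARG false,
#else
#define MOE_ENABLE_ALLTOALL_ARG
#endif

// Usage at the call site:
//   ..., mSourceToExpandedMap, parallelism_config, MOE_ENABLE_ALLTOALL_ARG
//   mUseLora, mLoraParams, /*use_deepseek_fp8_block_scale=*/false, ...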