
Commit b12be8d

Fusing slicing into finalizeMoERoutingKernel.
Signed-off-by: Bo Li <[email protected]>

Remove redundant slicing.
Signed-off-by: Bo Li <[email protected]>

TRT does not support padding, safe to assume padded/unpadded hidden sizes are the same.
Signed-off-by: Bo Li <[email protected]>

Address review comment.
Signed-off-by: Bo Li <[email protected]>

original_hidden_size -> unpadded_hidden_size
Signed-off-by: Bo Li <[email protected]>
1 parent 2f2f5cc commit b12be8d
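
For intuition, a minimal CPU sketch of what "fusing slicing into finalizeMoeRouting" means (hypothetical names, not the actual TensorRT-LLM call sites; the top-k scale-and-reduce that finalize also performs is omitted for brevity): rather than finalizing into padded rows and slicing them in a second pass, the finalize step reads with the padded stride but writes only the first unpadded columns, eliminating one full pass over the output.

// Hypothetical before/after sketch of the fusion -- not the actual code.
#include <cstdint>
#include <vector>

// Before (two passes): finalize writes [num_rows, padded_cols], then a slice
// kernel copies the first orig_cols of each row into the final output.
// After (fused): finalize reads with the padded stride and writes densely at
// the unpadded width in a single pass.
std::vector<float> finalizeFused(std::vector<float> const& permuted_rows,
    int64_t num_rows, int64_t padded_cols, int64_t orig_cols)
{
    std::vector<float> out(num_rows * orig_cols);
    for (int64_t r = 0; r < num_rows; ++r)
        for (int64_t c = 0; c < orig_cols; ++c) // padded tail is never touched
            out[r * orig_cols + c] = permuted_rows[r * padded_cols + c];
    return out;
}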

File tree

12 files changed: +135 -105 lines changed

cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h

Lines changed: 27 additions & 23 deletions
@@ -21,6 +21,7 @@
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/quantization.h"
 #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h"
+#include <cstdint>
 #ifdef ENABLE_FP4
 #include <cuda_fp4.h>
 #endif
@@ -451,7 +452,7 @@ class CutlassMoeFCRunnerInterface
     virtual void runMoe(void const* input_activations, void const* input_sf, bool const swizzled_input_sf,
         int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights,
         void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights,
-        void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size,
+        void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const orig_hidden_size,
         int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr,
         void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config,
         bool const enable_alltoall, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale,
@@ -480,11 +481,12 @@ class CutlassMoeFCRunnerInterface
         float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row,
         int const* permuted_row_to_unpermuted_row, int const* const token_selected_experts,
         int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows,
-        int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node,
-        int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora,
-        bool use_deepseek_fp8_block_scale, cudaStream_t stream, MOEParallelismConfig parallelism_config,
-        bool const enable_alltoall, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
-        int* num_active_experts_per, int* active_expert_global_ids)
+        int64_t const hidden_size, int64_t const orig_hidden_size, int64_t const inter_size,
+        int const num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array,
+        bool use_lora, void* fc2_lora, bool use_deepseek_fp8_block_scale, cudaStream_t stream,
+        MOEParallelismConfig parallelism_config, bool const enable_alltoall,
+        cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per,
+        int* active_expert_global_ids)
         = 0;

     virtual std::pair<TmaWarpSpecializedGroupedGemmInput, TmaWarpSpecializedGroupedGemmInput>
@@ -606,7 +608,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
     void runMoe(void const* input_activations, void const* input_sf, bool const swizzled_input_sf,
         int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights,
         void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights,
-        void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size,
+        void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const orig_hidden_size,
         int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr,
         void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config,
         bool const enable_alltoall, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale,
@@ -641,11 +643,11 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
         float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row,
         int const* permuted_row_to_unpermuted_row, int const* const token_selected_experts,
         int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows,
-        int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node,
-        int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora,
-        cudaStream_t stream, MOEParallelismConfig parallelism_config, bool const enable_alltoall,
-        cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per,
-        int* active_expert_global_ids);
+        int64_t const hidden_size, int64_t const orig_hidden_size, int64_t const inter_size,
+        int const num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array,
+        bool use_lora, void* fc2_lora, cudaStream_t stream, MOEParallelismConfig parallelism_config,
+        bool const enable_alltoall, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
+        int* num_active_experts_per, int* active_expert_global_ids);

     // Overrides to allow us to forward on to the internal functions with the pointers using the correct type
     void gemm1(void const* const input, void* const output, void* const intermediate_result,
@@ -678,11 +680,12 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
         float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row,
         int const* permuted_row_to_unpermuted_row, int const* const token_selected_experts,
         int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows,
-        int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node,
-        int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora,
-        bool use_deepseek_fp8_block_scale, cudaStream_t stream, MOEParallelismConfig parallelism_config,
-        bool const enable_alltoall, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
-        int* num_active_experts_per, int* active_expert_global_ids) override
+        int64_t const hidden_size, int64_t const orig_hidden_size, int64_t const inter_size,
+        int const num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array,
+        bool use_lora, void* fc2_lora, bool use_deepseek_fp8_block_scale, cudaStream_t stream,
+        MOEParallelismConfig parallelism_config, bool const enable_alltoall,
+        cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per,
+        int* active_expert_global_ids) override
     {
         auto* block_scale_gemm_runner = use_deepseek_fp8_block_scale ? getDeepSeekBlockScaleGemmRunner() : nullptr;
         return Self::gemm2(moe_gemm_runner_, block_scale_gemm_runner, static_cast<T const*>(input), gemm_output,
@@ -691,9 +694,9 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
             static_cast<ScaleBiasType const*>(fc2_int_scales), fc2_fp8_dequant, fc2_fp4_act_flat, quant_params,
             token_topk_unpermuted_scales, token_topk_permuted_scales, unpermuted_row_to_permuted_row,
             permuted_row_to_unpermuted_row, token_selected_experts, num_valid_tokens_ptr, num_rows, expanded_num_rows,
-            hidden_size, inter_size, num_experts_per_node, experts_per_token, alpha_scale_ptr_array, use_lora, fc2_lora,
-            stream, parallelism_config, enable_alltoall, config, min_latency_mode, num_active_experts_per,
-            active_expert_global_ids);
+            hidden_size, orig_hidden_size, inter_size, num_experts_per_node, experts_per_token, alpha_scale_ptr_array,
+            use_lora, fc2_lora, stream, parallelism_config, enable_alltoall, config, min_latency_mode,
+            num_active_experts_per, active_expert_global_ids);
     }

     virtual size_t getGemmWorkspaceSize(int num_experts_per_node) const override
@@ -830,9 +833,9 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
         float const* const token_topk_unpermuted_scales, int const* const unpermuted_row_to_permuted_row,
         int const* const permuted_row_to_unpermuted_row, int const* const token_selected_experts,
         int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows,
-        int64_t const hidden_size, int64_t const inter_size, int64_t const num_experts_per_node, int64_t const k,
-        MOEParallelismConfig parallelism_config, bool const enable_alltoall, QuantParams& quant_params,
-        cudaStream_t stream);
+        int64_t const hidden_size, int64_t const orig_hidden_size, int64_t const inter_size,
+        int64_t const num_experts_per_node, int64_t const k, MOEParallelismConfig parallelism_config,
+        bool const enable_alltoall, QuantParams& quant_params, cudaStream_t stream);

     T const* applyPrequantScale(void* smoothed_act, void const* permuted_data, void const* prequant_scales,
         int64_t const* num_valid_tokens_ptr, int64_t const expanded_num_rows, int64_t const seq_len, bool const use_awq,
@@ -953,6 +956,7 @@ struct GemmProfilerBackend
     int64_t mNumExpertsPerNode{};
     int64_t mK{};
     int64_t mExpertHiddenSize{};
+    int64_t mExpertOrigHiddenSize{};
     int64_t mExpertInterSize{};
     int64_t mGroupSize{};
     ActivationType mActivationType{};

cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h

Lines changed: 3 additions & 2 deletions
@@ -66,8 +66,9 @@ void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_ro
     OutputType* reduced_unpermuted_output, ScaleBiasType const* bias, float const* final_scales,
     int const* unpermuted_row_to_permuted_row, int const* permuted_row_to_unpermuted_row,
     int const* token_selected_experts, int64_t const* expert_first_token_offset, int64_t const num_rows,
-    int64_t const cols, int64_t const experts_per_token, int64_t const num_experts_per_node,
-    MOEParallelismConfig parallelism_config, bool const enable_alltoall, cudaStream_t stream);
+    int64_t const padded_cols, int64_t const orig_cols, int64_t const experts_per_token,
+    int64_t const num_experts_per_node, MOEParallelismConfig parallelism_config, bool const enable_alltoall,
+    cudaStream_t stream);

 } // namespace cutlass_kernels
 } // namespace tensorrt_llm::kernels
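
To make the new padded_cols/orig_cols split concrete, here is a hedged CUDA sketch of a fused finalize kernel. It is illustrative only: the real launcher also takes bias, expert_first_token_offset, parallelism config, and mixed precisions, and the -1 sentinel for off-node rows is an assumption. Each block reduces one token's top-k expert rows and writes only the first orig_cols columns:

// Hypothetical fused finalize kernel -- a sketch, not the TensorRT-LLM kernel.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void finalizeMoeRoutingFusedSlice(
    float const* expanded_permuted_rows,       // [expanded_num_rows, padded_cols]
    float* reduced_unpermuted_output,          // [num_rows, orig_cols]
    float const* final_scales,                 // [num_rows, experts_per_token]
    int const* unpermuted_row_to_permuted_row, // [num_rows * experts_per_token]
    int64_t num_rows, int64_t padded_cols, int64_t orig_cols, int64_t experts_per_token)
{
    int64_t const row = blockIdx.x;
    if (row >= num_rows)
        return;

    // Only the first orig_cols columns are reduced and written; the padded
    // tail [orig_cols, padded_cols) is never copied, so no slice pass is needed.
    for (int64_t col = threadIdx.x; col < orig_cols; col += blockDim.x)
    {
        float acc = 0.f;
        for (int64_t k = 0; k < experts_per_token; ++k)
        {
            int64_t const idx = row * experts_per_token + k;
            int const permuted_row = unpermuted_row_to_permuted_row[idx];
            if (permuted_row < 0) // assumed sentinel: expert row not on this node
                continue;
            acc += final_scales[idx]
                * expanded_permuted_rows[int64_t(permuted_row) * padded_cols + col];
        }
        reduced_unpermuted_output[row * orig_cols + col] = acc;
    }
}

// Example launch: one block per token, threads strided over orig_cols.
// finalizeMoeRoutingFusedSlice<<<unsigned(num_rows), 256, 0, stream>>>(
//     expanded_permuted_rows, final_output, final_scales,
//     unpermuted_row_to_permuted_row, num_rows, padded_cols, orig_cols, k);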
