
Commit c0d8b42

Apply per expert act scale to FC1 for w4a8 moe on PyT flow
Signed-off-by: Min Yu <[email protected]>
1 parent ab26d21 commit c0d8b42

2 files changed: +49, -18 lines changed

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 5 additions & 1 deletion
@@ -1508,14 +1508,18 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
         static_assert(!is_nvfp4 && !is_mxfp8, "NVFP4 and MXFP8 are not supported for AWQ");
         static_assert(!std::is_same_v<InputActivationsType, ExpandedActivationsType>,
             "Input and output types must be different for AWQ");
+        int64_t expert = findTotalEltsLessThanTarget(
+                             expert_first_token_offset, num_experts_per_node, (int64_t) permuted_row + 1)
+            - 1;
         for (int elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride)
         {
             auto frag_elems = source_row_ptr[elem_index];

             CUTLASS_PRAGMA_UNROLL
             for (int e = 0; e < ELEM_PER_THREAD; e++)
             {
-                frag_elems[e] = frag_elems[e] * prequant_scales[elem_index * ELEM_PER_THREAD + e];
+                frag_elems[e]
+                    = frag_elems[e] * prequant_scales[expert * hidden_size + elem_index * ELEM_PER_THREAD + e];
             }

             dest_row_ptr[elem_index] = arrayConvert<DataElem, OutputElem>(frag_elems);
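
For reference, a minimal PyTorch sketch of the indexing the new kernel code performs (names and shapes here are illustrative, not the kernel's actual variables): expert_first_token_offset holds the cumulative first-row offset of each expert in the permuted activation buffer, so the owning expert of a permuted row can be found with a findTotalEltsLessThanTarget-style search, and that expert's row of a [num_experts, hidden_size] prequant_scales table is applied to the activation row.

import torch

# Illustrative sizes only.
num_experts_per_node, hidden_size = 4, 8

# Cumulative first-row offset per expert; the last entry is the total row count,
# so expert 0 owns rows [0, 3), expert 1 owns rows [3, 7), and so on.
expert_first_token_offset = torch.tensor([0, 3, 7, 9, 10])
num_rows = int(expert_first_token_offset[-1])

prequant_scales = torch.rand(num_experts_per_node, hidden_size)  # per-expert, per-channel
permuted_rows = torch.rand(num_rows, hidden_size)

# Equivalent of findTotalEltsLessThanTarget(offsets, n, row + 1) - 1: the index of the
# last expert whose first-row offset is <= the permuted row index.
row_idx = torch.arange(num_rows)
expert = torch.searchsorted(expert_first_token_offset, row_idx, right=True) - 1

# Per-expert scaling; in the flat kernel view this reads
# prequant_scales[expert * hidden_size + col] instead of prequant_scales[col].
scaled_rows = permuted_rows * prequant_scales[expert]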

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 44 additions & 17 deletions
@@ -910,9 +910,10 @@ def create_weights(self, module: torch.nn.Module):
                             module.intermediate_size_per_partition // 2)

         # Multiply act with reciprocal of per-channel pre_quant_scale * per-tensor input_scale
-        fc31_act_scale = nn.Parameter(torch.empty(1,
-                                                  module.hidden_size,
-                                                  dtype=module.dtype),
+        fc31_act_scale = nn.Parameter(torch.empty(
+            module.expert_size_per_partition,
+            module.hidden_size,
+            dtype=module.dtype),
                                       requires_grad=False)
         module.register_parameter("fc31_act_scale", fc31_act_scale)

@@ -1125,15 +1126,29 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
                               device=self.device)
             for expert_id in module.initial_local_expert_ids
         ]
-        all_w3_w1_pre_quant_scales_max = torch.max(
-            torch.stack(all_w3_pre_quant_scales +
-                        all_w1_pre_quant_scales).to(module.dtype),
+        all_w3_w1_pre_quant_scales_greater = torch.max(
+            torch.stack([
+                torch.stack(all_w3_pre_quant_scales),
+                torch.stack(all_w1_pre_quant_scales)
+            ]).to(module.dtype),
+            dim=0,
+        ).values.permute(1, 0)
+
+        all_w3_w1_input_scales_greater = torch.max(
+            torch.stack([
+                torch.stack(all_w3_input_scales),
+                torch.stack(all_w1_input_scales)
+            ]).to(module.dtype),
             dim=0,
         ).values
+
+        all_w3_w1_pre_quant_scales_div_input_scales = (
+            all_w3_w1_pre_quant_scales_greater *
+            (1 / all_w3_w1_input_scales_greater.reshape(
+                1, module.expert_size_per_partition).float()))
+
         module.fc31_act_scale.data.copy_(
-            torch.ones_like(module.fc31_act_scale, device=self.device) *
-            (all_w3_w1_pre_quant_scales_max) *
-            (1 / all_w3_w1_input_scales_max))
+            all_w3_w1_pre_quant_scales_div_input_scales.permute(1, 0))
         # In vanilla ckpt (at least from ModelOpt), per-tensor weight_scale_2 is separately stored
         all_w3_weight_scale_2 = [
             load_weight_shard(weights[f"{expert_id}.w3.weight_scale_2"],
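
The net effect of the block above is that fc31_act_scale now carries one row per expert: for each expert, the element-wise max of the w3/w1 pre_quant_scale vectors divided by the larger of the two per-tensor input scales. A simplified sketch of the same computation, using stand-in tensors and plain broadcasting instead of the reshape/permute sequence in the diff:

import torch

# Stand-ins for the per-expert scales loaded from the checkpoint; sizes are illustrative.
num_experts, hidden_size = 4, 8
dtype = torch.bfloat16
all_w3_pre_quant_scales = [torch.rand(hidden_size) for _ in range(num_experts)]
all_w1_pre_quant_scales = [torch.rand(hidden_size) for _ in range(num_experts)]
all_w3_input_scales = [torch.rand(()) for _ in range(num_experts)]
all_w1_input_scales = [torch.rand(()) for _ in range(num_experts)]

# Element-wise max of the w3/w1 pre_quant_scale per expert -> [num_experts, hidden_size].
pre_quant = torch.max(
    torch.stack([torch.stack(all_w3_pre_quant_scales),
                 torch.stack(all_w1_pre_quant_scales)]).to(dtype),
    dim=0).values

# Larger of the w3/w1 per-tensor input_scale per expert -> [num_experts].
input_scales = torch.max(
    torch.stack([torch.stack(all_w3_input_scales),
                 torch.stack(all_w1_input_scales)]).to(dtype),
    dim=0).values

# fc31_act_scale[e, c] = pre_quant_scale[e, c] / input_scale[e].
fc31_act_scale = pre_quant * (1.0 / input_scales.float()).unsqueeze(-1)
assert fc31_act_scale.shape == (num_experts, hidden_size)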
@@ -1145,13 +1160,21 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
                               device=self.device)
             for expert_id in module.initial_local_expert_ids
         ]
-        all_w3_w1_weight_scale_2_max = torch.max(
-            torch.stack(all_w3_weight_scale_2 + all_w1_weight_scale_2).to(
-                module.dtype),
-            dim=0,
-        ).values
-        module.fc31_alpha.data.copy_(all_w3_w1_weight_scale_2_max.float() *
-                                     all_w3_w1_input_scales_max.float())
+        all_w3_w1_weight_scale_2 = torch.stack([
+            torch.stack(all_w3_weight_scale_2),
+            torch.stack(all_w1_weight_scale_2)
+        ]).to(module.dtype)
+        all_w3_w1_weight_scale_2_greater = torch.max(
+            all_w3_w1_weight_scale_2, dim=0).values
+
+        all_w3_w1_weight_scale_2_mul_input_scales = (
+            all_w3_w1_weight_scale_2_greater.reshape(
+                module.expert_size_per_partition, 1).float() *
+            all_w3_w1_input_scales_greater.reshape(
+                module.expert_size_per_partition, 1).float())
+        module.fc31_alpha.data.copy_(
+            all_w3_w1_weight_scale_2_mul_input_scales.reshape(
+                module.expert_size_per_partition, 1).float())

         # Per-group weight_scale
         all_w3_scales = [
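
Similarly, fc31_alpha becomes a per-expert quantity: the larger of the two per-tensor weight_scale_2 values (w3 vs. w1) multiplied by the per-expert input scale, stored with shape [num_experts, 1]. A stand-alone sketch with stand-in values:

import torch

# Stand-ins; in the checkpoint these are per-tensor (scalar) scales, one per expert.
num_experts = 4
dtype = torch.bfloat16
all_w3_weight_scale_2 = [torch.rand(()) for _ in range(num_experts)]
all_w1_weight_scale_2 = [torch.rand(()) for _ in range(num_experts)]
input_scales = torch.rand(num_experts)  # per-expert max of the w3/w1 input_scale, as above

# Larger of the two weight_scale_2 values per expert -> [num_experts].
weight_scale_2 = torch.max(
    torch.stack([torch.stack(all_w3_weight_scale_2),
                 torch.stack(all_w1_weight_scale_2)]).to(dtype),
    dim=0).values

# fc31_alpha[e] = weight_scale_2[e] * input_scale[e], stored as [num_experts, 1].
fc31_alpha = (weight_scale_2.float() * input_scales.float()).reshape(num_experts, 1)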
@@ -1179,7 +1202,11 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
         w3_w1_scales = all_w3_w1_scales.to(torch.bfloat16).view(
             module.dtype)
         if module.weight_loading_mode == MoEWeightLoadingMode.VANILLA:
-            w3_w1_scales /= all_w3_w1_weight_scale_2_max.float()
+            w3_w1_scales = w3_w1_scales.permute(1, 2, 0)
+            w3_w1_scales /= all_w3_w1_weight_scale_2_greater.reshape(
+                module.expert_size_per_partition).float()
+            w3_w1_scales = w3_w1_scales.permute(2, 0, 1)
+
         w3_w1_s_shape = w3_w1_scales.shape
         w3_w1_scales_interleaved = w3_w1_scales.reshape(
             w3_w1_s_shape[0], w3_w1_s_shape[1],
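
In VANILLA mode the per-group weight scales are now divided by the per-expert weight_scale_2 instead of a single shared value; the permutes simply move the expert axis into broadcasting position. A sketch assuming a [num_experts, out_channels, num_groups] layout (the real layout may differ):

import torch

# Illustrative shapes only.
num_experts, out_channels, num_groups = 4, 16, 2
w3_w1_scales = torch.rand(num_experts, out_channels, num_groups)
weight_scale_2 = torch.rand(num_experts)  # per-expert, as in the previous sketch

# The diff permutes so the expert axis is last, divides, and permutes back.
scaled = w3_w1_scales.permute(1, 2, 0)            # [out_channels, num_groups, num_experts]
scaled = scaled / weight_scale_2.reshape(num_experts).float()
scaled = scaled.permute(2, 0, 1)                  # back to [num_experts, out_channels, num_groups]

# Equivalent broadcast without the permutes:
assert torch.allclose(scaled, w3_w1_scales / weight_scale_2.reshape(num_experts, 1, 1))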
