@@ -110,16 +110,12 @@ def load_weights_vanilla_helper(module: Linear, weights: List[Dict]):
     weight = load_weight_shard(weights[0]['weight'], module.tp_size,
                                module.tp_rank, module.tp_mode, device)

-    if module.has_w4a16_awq or module.has_weight_only_quant:
+    if module.has_weight_only_quant:
         # NOTE: without the preprocess during the runtime, the gemm output nan's. in order to use the preprocess_weights_for_mixed_gemm
         # we need to cast the weight to int8 first.
-        if module.has_w4a16_awq or module.quant_config.layer_quant_mode.is_int4_weight_only(
-        ):
-            quant_mode = torch.quint4x2
-        elif module.quant_config.layer_quant_mode.is_int8_weight_only():
-            quant_mode = torch.int8
+        weight_dtype, _ = get_weight_dtype_and_id(module)
         weight = preprocess_weights_for_mixed_gemm(
-            weight.T.to(torch.int8).contiguous().cpu(), quant_mode,
+            weight.T.to(torch.int8).contiguous().cpu(), weight_dtype,
             torch.float16).cuda().contiguous()

     copy_weight(module.weight, weight)
@@ -174,6 +170,27 @@ def load_weights_fused_gate_up_helper(
     return (gate_weight, up_weight)


+def get_weight_dtype_and_id(module: Linear) -> tuple[torch.dtype, int]:
+    """
+    Get weight dtype and weight_id for weight only quantization mode.
+
+    Returns:
+        tuple[torch.dtype, int]: (weight_dtype, weight_id) where:
+            - weight_dtype: torch.int8 for INT8 weights, torch.quint4x2 for INT4 weights
+            - weight_id: 1 for INT8, 2 for INT4 (used for weight packing)
+    """
+    assert module.quant_config is not None and module.quant_config.layer_quant_mode.is_weight_only(
+    ), "This function should only be called when the module has weight-only quantization enabled."
+
+    if module.quant_config.layer_quant_mode.is_int8_weight_only():
+        return torch.int8, 1
+    elif module.quant_config.layer_quant_mode.is_int4_weight_only():
+        return torch.quint4x2, 2
+    else:
+        raise ValueError(
+            f"Unsupported quant_mode: {module.quant_config.layer_quant_mode}")
+
+
 class LinearMethodBase(ABC):
     """
     Base class for all linear methods.
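
For orientation, a minimal sketch (not part of the diff) of how the now module-level helper is consumed; the call pattern mirrors load_weights_vanilla_helper above, and the surrounding setup is assumed:

    # INT8 weight-only -> (torch.int8, 1); INT4 weight-only -> (torch.quint4x2, 2)
    weight_dtype, weight_id = get_weight_dtype_and_id(module)
    # The dtype tag is what preprocess_weights_for_mixed_gemm expects for the
    # int8-cast weight tensor.
    weight = preprocess_weights_for_mixed_gemm(
        weight.T.to(torch.int8).contiguous().cpu(), weight_dtype,
        torch.float16).cuda().contiguous()
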
@@ -232,20 +249,6 @@ def load_weights_fused_gate_up_linear(self, module: Linear,
         """
         raise NotImplementedError

-    def _get_weight_dtype_and_id(self,
-                                 module: Linear) -> tuple[torch.dtype, int]:
-        """
-        get weight dtype and weight_id for weight only quantization mode
-        """
-        if module.quant_config.layer_quant_mode.is_int8_weight_only():
-            return torch.int8, 1
-        elif module.quant_config.layer_quant_mode.is_int4_weight_only():
-            return torch.quint4x2, 2
-        else:
-            raise ValueError(
-                f"Unsupported quant_mode: {module.quant_config.layer_quant_mode}"
-            )
-

 class UnquantizedLinearMethod(LinearMethodBase):

@@ -900,7 +903,7 @@ def create_weights(self, module: Linear, in_features: int,
                        out_features: int, bias: bool,
                        dtype: torch.dtype) -> None:

-        _, weight_id = self._get_weight_dtype_and_id(module)
+        _, weight_id = get_weight_dtype_and_id(module)

         # Quantized weights (int4 weights are packed into int8)
         module.weight = Parameter(torch.empty(
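
A hedged illustration of how the returned weight_id is typically used for packing; the shape and layout below are assumptions for illustration, not the file's actual allocation, which the hunk cuts off mid-statement:

    # Illustration (assumed layout): with weight_id == 2, two int4 values share
    # one int8 storage element, so the packed dimension shrinks by that factor.
    _, weight_id = get_weight_dtype_and_id(module)
    module.weight = Parameter(torch.empty(
        (in_features, out_features // weight_id), dtype=torch.int8),
                              requires_grad=False)
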
@@ -920,12 +923,12 @@ def create_weights(self, module: Linear, in_features: int,
     def apply(self, module: Linear, input: torch.Tensor,
               bias: Optional[torch.Tensor]) -> torch.Tensor:

-        weight_dtype, _ = self._get_weight_dtype_and_id(module)
+        weight_dtype, _ = get_weight_dtype_and_id(module)
         bias = bias.contiguous() if bias is not None else None

         output = torch.ops.trtllm.weight_only_quant_gemm(
-            input.to(module.dtype).contiguous(), module.weight, weight_dtype,
-            module.weight_scale, module.dtype)
+            input, module.weight, weight_dtype, module.weight_scale,
+            module.dtype)

         return output

@@ -972,7 +975,7 @@ def load_weights_fused_qkv_linear(self, module: Linear,

         fused_weight = torch.cat((q_weight, k_weight, v_weight))

-        weight_dtype, _ = self._get_weight_dtype_and_id(module)
+        weight_dtype, _ = get_weight_dtype_and_id(module)
         fused_weight = preprocess_weights_for_mixed_gemm(
             fused_weight.to(torch.int8).T.contiguous().cpu(), weight_dtype,
             torch.float16).cuda().contiguous()
@@ -988,10 +991,10 @@ def load_weights_fused_qkv_linear(self, module: Linear,
     def load_weights_fused_gate_up_linear(self, module: Linear,
                                           weights: List[Dict]) -> None:
         device = torch.device('cuda')
-        weight_dtype, _ = self._get_weight_dtype_and_id(module)
-
+        weight_dtype, _ = get_weight_dtype_and_id(module)
         gate_weight, up_weight = load_weights_fused_gate_up_helper(
             module, weights)
+
         fused_weight = torch.cat((gate_weight, up_weight))

         fused_weight = preprocess_weights_for_mixed_gemm(
@@ -1050,8 +1053,7 @@ def apply(self, module: Linear, input: torch.Tensor,

         bias = bias.contiguous() if bias is not None else None

-        output = torch.ops.trtllm.w4a16_gemm(input.to(
-            module.dtype).contiguous(),
+        output = torch.ops.trtllm.w4a16_gemm(input,
                                              module.weight,
                                              module.weight_scale.T.contiguous(),
                                              module.quant_config.group_size,