[ Kernel ] Enable Dynamic Per Token fp8 #6547

Merged
3 changes: 2 additions & 1 deletion tests/kernels/test_fp8_quant.py
@@ -27,7 +27,8 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
                    device="cuda") + 1e-6  # avoid nans
 
     ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.float8_e4m3fn)
-    ops_out, ops_scales = ops.dynamic_per_token_scaled_fp8_quant(x)
+    ops_out, ops_scales = ops.scaled_fp8_quant(x,
+                                               use_per_token_if_dynamic=True)
 
     assert torch.allclose(ref_scales, ops_scales)
     assert torch.allclose(ref_out.to(dtype=torch.float32),
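For context, a minimal sketch of what a per-token FP8 reference quantizer computes. The test imports `ref_dynamic_per_token_quant` from the test utilities; the helper below is an illustrative stand-in under assumed names, not that implementation.

```python
import torch


def ref_per_token_fp8_quant(x: torch.Tensor, dtype=torch.float8_e4m3fn):
    # Illustrative stand-in: one scale per token (row), chosen so the row's
    # largest magnitude maps onto the FP8 representable maximum.
    finfo = torch.finfo(dtype)
    amax = x.abs().amax(dim=-1, keepdim=True).to(torch.float32)
    # Guard against all-zero rows (the test avoids this with its "+ 1e-6").
    scales = amax.clamp(min=1e-12) / finfo.max
    out = (x.to(torch.float32) / scales).clamp(finfo.min, finfo.max).to(dtype)
    return out, scales
```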
26 changes: 13 additions & 13 deletions vllm/_custom_ops.py
@@ -300,6 +300,7 @@ def scaled_fp8_quant(
     input: torch.Tensor,
     scale: Optional[torch.Tensor] = None,
     batch_dim_padding: Optional[int] = None,
+    use_per_token_if_dynamic: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.
@@ -315,6 +316,8 @@ def scaled_fp8_quant(
         scale: Optional scaling factor for the FP8 quantization
         batch_dim_padding: If specified, pad the first dimension
             of the output to at least this value.
+        use_per_token_if_dynamic: Whether to do per_tensor or per_token
+            in the dynamic quantization case.
 
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
@@ -328,22 +331,19 @@ def scaled_fp8_quant(
     else:
         output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
     if scale is None:
-        scale = torch.zeros(1, device=input.device, dtype=torch.float32)
-        torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
+        if use_per_token_if_dynamic:
+            scale = torch.empty((input.numel() // input.shape[-1], 1),
+                                device=input.device,
+                                dtype=torch.float32)
+            torch.ops._C.dynamic_per_token_scaled_fp8_quant(
+                output, input, scale)
+        else:
+            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         torch.ops._C.static_scaled_fp8_quant(output, input, scale)
-    return output, scale
-
-
-def dynamic_per_token_scaled_fp8_quant(
-        input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-
-    output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
-    scales = torch.empty((input.numel() // input.shape[-1], 1),
-                         device=input.device,
-                         dtype=torch.float32)
-    torch.ops._C.dynamic_per_token_scaled_fp8_quant(output, input, scales)
-    return output, scales
+    return output, scale
 
 
 # int8
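A usage sketch of the updated entry point, assuming a CUDA build of vLLM that exposes `vllm._custom_ops`; only the argument names shown in this PR's diff are relied on. The key difference between the modes is the shape of the returned scale.

```python
import torch
from vllm import _custom_ops as ops  # assumes a CUDA build of vLLM

x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")

# Dynamic per-tensor (default): the kernel computes a single scale, shape (1,).
q_pt, scale_pt = ops.scaled_fp8_quant(x)

# Dynamic per-token: one scale per row of the flattened input, shape (16, 1).
q_tok, scale_tok = ops.scaled_fp8_quant(x, use_per_token_if_dynamic=True)

# Static: the caller supplies the scale; use_per_token_if_dynamic has no effect.
static_scale = torch.tensor([0.5], device="cuda", dtype=torch.float32)
q_st, _ = ops.scaled_fp8_quant(x, scale=static_scale)
```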
4 changes: 3 additions & 1 deletion vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -113,7 +113,9 @@ def apply_fp8_linear(
     # If static, layer.input_scale is scalar and x_scale is input_scale.
 
     if cutlass_fp8_supported:
-        qinput, x_scale = ops.scaled_fp8_quant(input, input_scale)
+        qinput, x_scale = ops.scaled_fp8_quant(input,
+                                               input_scale,
+                                               use_per_token_if_dynamic=True)
Member:
Shouldn't use_per_token_if_dynamic be set by a scheme or something? I don't see why it should always be true for this case

Collaborator Author:
In what case would we not want to use dynamic per token?

Collaborator:
@mgoin the hypothesis here is that dynamic per token is an overall win over dynamic per tensor when supported. Higher accuracy, but also easier to fuse RMSNorm + Quant, and fewer dependencies so better parallelization for the quantize kernels overall. Downside is more scales.

We'll need to benchmark, but IMO ok for this PR
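
To illustrate the accuracy argument above, a plain-PyTorch sketch (illustrative only, not code from this PR): with a single dynamic per-tensor scale, one outlier token sets the scale for every token, while per-token scales adapt row by row.

```python
import torch

x = torch.randn(8, 4096)
x[0] *= 100.0  # one outlier token inflates the global max
fp8_max = torch.finfo(torch.float8_e4m3fn).max

per_tensor_scale = x.abs().max() / fp8_max                      # scalar
per_token_scale = x.abs().amax(dim=-1, keepdim=True) / fp8_max  # shape (8, 1)

# With the single per-tensor scale, the seven "normal" rows are quantized with
# a scale roughly 100x larger than they need, wasting most of the FP8 range;
# the per-token scales stay matched to each row.
print(per_tensor_scale.item(), per_token_scale.squeeze())
```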

Member:
I assume there is a decent performance hit compared to producing/using a per-tensor scale; is this not the case?

Collaborator:
I wouldn't assume a decent performance hit -- the overheads from the CUTLASS epilogues are quite small (like 3%), and there are advantages to doing per-token when quantizing as well. We'll measure.

Collaborator:
I changed my mind -- as I am implementing the per-token/per-channel wrapper for torch._scaled_mm, I think it would be nicer to grab this from a config somewhere

Collaborator Author:
I tried to pass this in from the scheme. However, that became very awkward because I needed to check whether CUTLASS is supported in multiple places. I think deciding it here is the right place.


         # Fused GEMM_DQ
         output = ops.cutlass_scaled_mm(qinput,
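
A hypothetical sketch of the alternative raised in the review thread above: letting a quantization scheme or config choose the dynamic-activation granularity instead of hard-coding `use_per_token_if_dynamic=True` at this call site. The `Fp8ActivationScheme` class and `quantize_input` helper below are illustrative names only; they do not exist in vLLM or in this PR.

```python
from dataclasses import dataclass
from typing import Optional

import torch
from vllm import _custom_ops as ops  # assumes a CUDA build of vLLM


@dataclass
class Fp8ActivationScheme:
    # Hypothetical config knob: granularity used when the activation scale
    # is computed dynamically (per-token vs. per-tensor).
    dynamic_per_token: bool = True


def quantize_input(x: torch.Tensor,
                   input_scale: Optional[torch.Tensor],
                   scheme: Fp8ActivationScheme):
    # Only the quantization call changes; the fused GEMM
    # (ops.cutlass_scaled_mm) would follow exactly as in the diff above.
    return ops.scaled_fp8_quant(
        x, input_scale,
        use_per_token_if_dynamic=scheme.dynamic_per_token)
```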