Commit

Fix input_scale typing in w8a8_utils.py
mgoin authored Jul 19, 2024
1 parent a921e86 commit d048985
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -104,7 +104,7 @@ def apply_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
     cutlass_fp8_supported: bool = True,
 ) -> torch.Tensor:
@@ -176,7 +176,7 @@ def apply_int8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
 ):
     # ops.scaled_int8_quant supports both dynamic and static quant.
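
Making input_scale an Optional[torch.Tensor] defaulting to None matches the two ways these helpers are used: static quantization passes a precomputed activation scale, while dynamic quantization passes nothing and lets the quant op derive a scale at runtime, as the comment on ops.scaled_int8_quant notes. The sketch below is only an illustration of that branching under assumed names (quantize_activation_int8 and the per-tensor absmax scale are made up for the example), not vLLM's actual implementation.

from typing import Optional

import torch


def quantize_activation_int8(
    x: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    # Static quant: the caller supplies a calibrated scale, so reuse it.
    # Dynamic quant: no scale is given, so derive a per-tensor absmax
    # scale from the current activations.
    if input_scale is None:
        input_scale = x.abs().max() / 127.0
    q = torch.clamp(torch.round(x / input_scale), -128, 127).to(torch.int8)
    return q, input_scale


x = torch.randn(4, 8)

# Static: a scale computed offline (e.g. during calibration) is passed in.
q_static, s_static = quantize_activation_int8(x, torch.tensor(0.02))

# Dynamic: omitting input_scale triggers on-the-fly scale computation,
# which is why the parameter defaults to None in the patched signatures.
q_dynamic, s_dynamic = quantize_activation_int8(x)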
