diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 6cf156459a61b..9aaffcd1823cc 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -73,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: FBGEMMFp8Config): self.quant_config = quant_config + self.cutlass_fp8_supported = cutlass_fp8_supported() def create_weights( self, @@ -146,5 +147,5 @@ def apply(self, input_scale=None, input_scale_ub=layer.input_scale_ub, bias=bias, - cutlass_fp8_supported=cutlass_fp8_supported(), + cutlass_fp8_supported=self.cutlass_fp8_supported, use_per_token_if_dynamic=True)