From 4d36c48f2799e3372ed75d7d6e721c57f1c9631b Mon Sep 17 00:00:00 2001
From: Elsa Granger <6374697+zeyugao@users.noreply.github.com>
Date: Sun, 28 Jul 2024 23:13:49 +0800
Subject: [PATCH] [Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8
 (#6871)

Signed-off-by: Alvant
---
 .../layers/quantization/fbgemm_fp8.py         | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index 5e8d1f1947421..e7c3859967c71 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -9,6 +9,7 @@
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
 
     def __init__(self, quant_config: FBGEMMFp8Config):
         self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
 
     def create_weights(
         self,
@@ -139,11 +141,12 @@ def apply(self,
                 size_k=layer.input_size_per_partition,
                 bias=bias)
 
-        return apply_fp8_linear(input=x,
-                                weight=layer.weight,
-                                weight_scale=layer.weight_scale,
-                                input_scale=None,
-                                input_scale_ub=layer.input_scale_ub,
-                                bias=bias,
-                                cutlass_fp8_supported=True,
-                                use_per_token_if_dynamic=True)
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            input_scale_ub=layer.input_scale_ub,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=True)