[Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (vllm-project#7225)

KuntaiDu · Aug 7, 2024 · 41febb0 · 41febb0
1 parent c77bc36
commit 41febb0
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 14 deletions.
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
                          ["--cpu-offload-gb", "2"])
 
 
-@pytest.mark.skipif(not is_quant_method_supported("awq"),
-                    reason="awq is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq():
+    # Test GPTQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test GPTQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+                         ["--quantization", "gptq"],
+                         ["--quantization", "gptq", "--cpu-offload-gb", "1"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
-    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
-                         ["--cpu-offload-gb", "2"])
+    # Test AWQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test AWQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
+                         ["--quantization", "awq"],
+                         ["--quantization", "awq", "--cpu-offload-gb", "1"])
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),

diff --git a/tests/utils.py b/tests/utils.py
@@ -266,8 +266,9 @@ def compare_two_settings(model: str,
     arg1_results = results[:n]
     arg2_results = results[n:]
     for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, \
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+        assert arg1_result == arg2_result, (
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
+            f"{arg1_result=} != {arg2_result=}")
 
 
 def init_test_distributed_environment(

diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
@@ -204,13 +204,7 @@ def create_weights(
 
         layer.exllama_state = exllama_state
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        qweight = layer.qweight
-        out_shape = x.shape[:-1] + (qweight.shape[-1], )
-        reshaped_x = x.reshape(-1, x.shape[-1])
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # exllama needs to shuffle the weight after the weight is loaded
         # here we do the shuffle on first forward pass
         if layer.exllama_state == ExllamaState.UNINITIALIZED:
@@ -222,6 +216,14 @@ def apply(self,
             layer.exllama_state = ExllamaState.READY
             ops.gptq_shuffle(layer.qweight, layer.g_idx,
                              self.quant_config.weight_bits)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
         output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
                                layer.scales, layer.g_idx,
                                layer.exllama_state == ExllamaState.READY,

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -251,7 +251,6 @@ def create_weights(
                 scales_and_zp_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
-                device="meta",
             ),
             requires_grad=False,
         )