Skip to content

Commit

Permalink
[Bugfix] Fix GPTQ and GPTQ Marlin CPU Offloading (vllm-project#7225)
Browse files Browse the repository at this point in the history
  • Loading branch information
mgoin authored Aug 7, 2024
1 parent c77bc36 commit 41febb0
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 14 deletions.
25 changes: 21 additions & 4 deletions tests/basic_correctness/test_cpu_offload.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
["--cpu-offload-gb", "2"])


@pytest.mark.skipif(not is_quant_method_supported("awq"),
reason="awq is not supported on this GPU type.")
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
# Test GPTQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
["--cpu-offload-gb", "1"])
# Test GPTQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
["--quantization", "gptq"],
["--quantization", "gptq", "--cpu-offload-gb", "1"])


@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
["--cpu-offload-gb", "2"])
# Test AWQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
["--cpu-offload-gb", "1"])
# Test AWQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
["--quantization", "awq"],
["--quantization", "awq", "--cpu-offload-gb", "1"])


@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
Expand Down
5 changes: 3 additions & 2 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,9 @@ def compare_two_settings(model: str,
arg1_results = results[:n]
arg2_results = results[n:]
for arg1_result, arg2_result in zip(arg1_results, arg2_results):
assert arg1_result == arg2_result, \
f"Results for {model=} are not the same with {arg1=} and {arg2=}"
assert arg1_result == arg2_result, (
f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
f"{arg1_result=} != {arg2_result=}")


def init_test_distributed_environment(
Expand Down
16 changes: 9 additions & 7 deletions vllm/model_executor/layers/quantization/gptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,13 +204,7 @@ def create_weights(

layer.exllama_state = exllama_state

def apply(self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
qweight = layer.qweight
out_shape = x.shape[:-1] + (qweight.shape[-1], )
reshaped_x = x.reshape(-1, x.shape[-1])
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
# exllama needs to shuffle the weight after the weight is loaded
# here we do the shuffle on first forward pass
if layer.exllama_state == ExllamaState.UNINITIALIZED:
Expand All @@ -222,6 +216,14 @@ def apply(self,
layer.exllama_state = ExllamaState.READY
ops.gptq_shuffle(layer.qweight, layer.g_idx,
self.quant_config.weight_bits)

def apply(self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
reshaped_x = x.reshape(-1, x.shape[-1])

output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
layer.scales, layer.g_idx,
layer.exllama_state == ExllamaState.READY,
Expand Down
1 change: 0 additions & 1 deletion vllm/model_executor/layers/quantization/gptq_marlin.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,6 @@ def create_weights(
scales_and_zp_size,
output_size_per_partition // self.quant_config.pack_factor,
dtype=torch.int32,
device="meta",
),
requires_grad=False,
)
Expand Down

0 comments on commit 41febb0

Please sign in to comment.