From 5d9021969c240866cc15221efe48d5815ffdd1e9 Mon Sep 17 00:00:00 2001
From: AlpinDale <52078762+AlpinDale@users.noreply.github.com>
Date: Wed, 18 Dec 2024 09:57:31 -0800
Subject: [PATCH] quants: update `qqq` and `gptq_marlin_24` to use AphroditeParameters (#921)

---
 aphrodite/modeling/layers/linear.py      |   3 +-
 aphrodite/quantization/gptq_marlin_24.py | 100 ++++++++++----------
 aphrodite/quantization/qqq.py            | 111 ++++++++++-------------
 3 files changed, 98 insertions(+), 116 deletions(-)

diff --git a/aphrodite/modeling/layers/linear.py b/aphrodite/modeling/layers/linear.py
index a9add9035..bf4ee67a1 100644
--- a/aphrodite/modeling/layers/linear.py
+++ b/aphrodite/modeling/layers/linear.py
@@ -26,7 +26,8 @@
 WEIGHT_LOADER_V2_SUPPORTED = [
     "CompressedTensorsLinearMethod", "GPTQMarlinLinearMethod",
     "AWQMarlinLinearMethod", "AWQLinearMethod", "HQQMarlinMethod",
-    "Fp8LinearMethod", "MarlinLinearMethod"
+    "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod",
+    "GPTQMarlin24LinearMethod",
 ]
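Note: adding "QQQLinearMethod" and "GPTQMarlin24LinearMethod" to WEIGHT_LOADER_V2_SUPPORTED routes both methods through the v2 weight loader, where each parameter object carries its own sharding metadata and loading logic, instead of a plain torch.nn.Parameter tagged after the fact via set_weight_attrs(). A minimal sketch of that idea follows; ShardedParameter and load_col_shard are hypothetical illustrations, not the actual Aphrodite API:

    # Illustrative sketch only -- not the Aphrodite implementation.
    import torch

    class ShardedParameter(torch.nn.Parameter):
        """Hypothetical stand-in for the AphroditeParameter family."""

        def __new__(cls, data, output_dim=None, weight_loader=None):
            self = super().__new__(cls, data, requires_grad=False)
            self.output_dim = output_dim
            self.weight_loader = weight_loader
            return self

        def load_col_shard(self, loaded, shard_offset, shard_size):
            # The parameter itself knows which dim to narrow; the loader
            # no longer reads ad-hoc attributes off a plain Parameter.
            if self.output_dim is not None:
                loaded = loaded.narrow(self.output_dim, shard_offset,
                                       shard_size)
            self.data.copy_(loaded)

    param = ShardedParameter(torch.empty(4, 8), output_dim=0)
    checkpoint = torch.arange(64, dtype=torch.float32).reshape(8, 8)
    param.load_col_shard(checkpoint, shard_offset=4, shard_size=4)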
diff --git a/aphrodite/quantization/gptq_marlin_24.py b/aphrodite/quantization/gptq_marlin_24.py
index 0c5c289e4..f56d0fb63 100644
--- a/aphrodite/quantization/gptq_marlin_24.py
+++ b/aphrodite/quantization/gptq_marlin_24.py
@@ -6,7 +6,10 @@
 from aphrodite import _custom_ops as ops
 from aphrodite.modeling.layers.linear import LinearBase, LinearMethodBase
-from aphrodite.modeling.utils import set_weight_attrs
+from aphrodite.modeling.parameter import (BaseAphroditeParameter,
+                                          ChannelQuantScaleParameter,
+                                          GroupQuantScaleParameter,
+                                          PackedAphroditeParameter)
 from aphrodite.quantization.base_config import QuantizationConfig
 from aphrodite.scalar_type import scalar_types
@@ -146,6 +149,7 @@ def create_weights(
         **extra_weight_attrs,
     ):
         del output_size  # Unused.
+        weight_loader = extra_weight_attrs["weight_loader"]

         if params_dtype != torch.float16:
             raise ValueError(
@@ -184,87 +188,79 @@
                 "Each permutation group must reside on the same gpu")

         # Quantized 4Bit weights packed into Int32.
-        qweight = Parameter(
-            torch.empty(
+        qweight = PackedAphroditeParameter(
+            data=torch.empty(
                 input_size_per_partition // self.quant_config.tile_size // 2,
                 output_size_per_partition * self.quant_config.tile_size //
                 self.quant_config.pack_factor,
                 device="cuda",
                 dtype=torch.int32,
             ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            qweight,
-            {
-                "input_dim": 0,
-                "output_dim": 1,
-                "packed_dim": 1,
-                "pack_factor": self.quant_config.pack_factor,
-                "marlin_tile_size": self.quant_config.tile_size,
-            },
-        )
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            marlin_tile_size=self.quant_config.tile_size,
+            weight_loader=weight_loader)

         # Meta
-        meta = Parameter(
-            torch.empty(
-                input_size_per_partition // 8 // 2 // 2,
-                output_size_per_partition * 2,
-                device="cuda",
-                dtype=torch.int16,
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            meta,
-            {
-                "input_dim": 0,
-                "packed_dim": 1,
-                "pack_factor": 1,
-                "output_dim": 1,
-                "marlin_tile_size": 2,
-            },
-        )
+        meta = PackedAphroditeParameter(data=torch.empty(
+            input_size_per_partition // 8 // 2 // 2,
+            output_size_per_partition * 2,
+            device="cuda",
+            dtype=torch.int16,
+        ),
+                                        input_dim=0,
+                                        output_dim=1,
+                                        packed_dim=1,
+                                        packed_factor=1,
+                                        marlin_tile_size=2,
+                                        weight_loader=weight_loader)

         # Determine if channelwise or not
         input_groups = (1 if self.quant_config.group_size == -1 else
                         input_size_per_partition //
                         self.quant_config.group_size)

-        scales = Parameter(
+        weight_scale_args = {
+            "data":
             torch.empty(
                 input_groups,
                 output_size_per_partition,
                 device="cuda",
                 dtype=params_dtype,
             ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            scales,
-            {
-                "input_dim": None if input_groups == 1 else 0,
-                "output_dim": 1,
-            },
-        )
+            "weight_loader":
+            weight_loader
+        }
+        if input_groups == 1:
+            scales = ChannelQuantScaleParameter(output_dim=1,
+                                                **weight_scale_args)
+        else:
+            scales = GroupQuantScaleParameter(output_dim=1,
+                                              input_dim=0,
+                                              **weight_scale_args)

         # Allocate workspace (Used for internal locking mechanism)
         max_workspace_size = (
             output_size_per_partition //
             self.quant_config.min_n_threads) * self.quant_config.max_parallel
-        workspace = Parameter(torch.zeros(max_workspace_size,
-                                          device="cuda",
-                                          dtype=torch.int),
-                              requires_grad=False)
+        workspace = BaseAphroditeParameter(data=torch.zeros(max_workspace_size,
+                                                            device="cuda",
+                                                            dtype=torch.int),
+                                           weight_loader=weight_loader)

         layer.register_parameter("B_24", qweight)
-        set_weight_attrs(qweight, extra_weight_attrs)
         layer.register_parameter("B_meta", meta)
-        set_weight_attrs(meta, extra_weight_attrs)
         layer.register_parameter("s", scales)
-        set_weight_attrs(scales, extra_weight_attrs)
         layer.register_parameter("workspace", workspace)
-        set_weight_attrs(workspace, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # required by torch.compile
+        layer.B_24 = Parameter(layer.B_24.data, requires_grad=False)
+        layer.s = Parameter(layer.s.data, requires_grad=False)
+        layer.B_meta = Parameter(layer.B_meta.data, requires_grad=False)
+        layer.workspace = Parameter(layer.workspace.data, requires_grad=False)

     def apply(
         self,
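The allocation shapes in create_weights() are easier to check with concrete numbers. A worked example under assumed sizes (the tile size and pack factor below are assumptions for illustration; eight 4-bit values pack into one int32, and 2:4 sparsity stores only half of the input elements, hence the extra // 2 on the qweight rows):

    # Worked shape arithmetic for the B_24 / B_meta allocations above.
    input_size_per_partition = 4096   # assumed sample size
    output_size_per_partition = 4096  # assumed sample size
    tile_size = 16                    # assumed Marlin tile size
    pack_factor = 8                   # eight 4-bit values per int32

    qweight_shape = (
        input_size_per_partition // tile_size // 2,            # 128
        output_size_per_partition * tile_size // pack_factor,  # 8192
    )
    # B_meta holds the 2:4 sparsity metadata as int16.
    meta_shape = (
        input_size_per_partition // 8 // 2 // 2,  # 128
        output_size_per_partition * 2,            # 8192
    )
    print(qweight_shape, meta_shape)  # (128, 8192) (128, 8192)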
diff --git a/aphrodite/quantization/qqq.py b/aphrodite/quantization/qqq.py
index 07af53777..110904ed3 100644
--- a/aphrodite/quantization/qqq.py
+++ b/aphrodite/quantization/qqq.py
@@ -5,7 +5,10 @@
 from aphrodite import _custom_ops as ops
 from aphrodite.modeling.layers.linear import LinearBase, LinearMethodBase
-from aphrodite.modeling.utils import set_weight_attrs
+from aphrodite.modeling.parameter import (BaseAphroditeParameter,
+                                          ChannelQuantScaleParameter,
+                                          GroupQuantScaleParameter,
+                                          PackedAphroditeParameter)
 from aphrodite.quantization.base_config import QuantizationConfig

 MARLIN_QQQ_TILE = 16
@@ -128,6 +131,7 @@ def create_weights(
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
+        weight_loader = extra_weight_attrs["weight_loader"]
         if params_dtype != torch.float16:
             raise ValueError(
                 f"The params dtype must be float16, but got {params_dtype}")
@@ -165,90 +169,71 @@
                 "Each permutation group must reside on the same gpu")

         # Quantized 4Bit weights packed into Int32.
-        qweight = Parameter(
-            torch.empty(
+        qweight = PackedAphroditeParameter(
+            data=torch.empty(
                 input_size_per_partition // self.quant_config.tile_size,
                 output_size_per_partition * self.quant_config.tile_size //
                 self.quant_config.pack_factor,
                 device="cuda",
                 dtype=torch.int32,
             ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            qweight,
-            {
-                "input_dim": 0,
-                "output_dim": 1,
-                "packed_dim": 1,
-                "pack_factor": self.quant_config.pack_factor,
-                "marlin_tile_size": self.quant_config.tile_size,
-            },
-        )
-
-        s_channel = Parameter(
-            torch.empty(
-                1,
-                output_size_per_partition,
-                device="cuda",
-                dtype=torch.float,
-            ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            s_channel,
-            {
-                "input_dim": None,
-                "output_dim": 1,
-            },
-        )
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            marlin_tile_size=self.quant_config.tile_size,
+            weight_loader=weight_loader)
+
+        s_channel = ChannelQuantScaleParameter(data=torch.empty(
+            1,
+            output_size_per_partition,
+            device="cuda",
+            dtype=torch.float,
+        ),
+                                               weight_loader=weight_loader,
+                                               output_dim=1)

         if self.quant_config.group_size == -1:
-            s_group = Parameter(
-                torch.tensor(
-                    [],
-                    device="cuda",
-                    dtype=torch.half,
-                ),
-                requires_grad=False,
+            s_group_data = torch.tensor(
+                [],
+                device="cuda",
+                dtype=torch.half,
             )
         else:
-            s_group = Parameter(
-                torch.empty(
-                    input_size_per_partition // self.quant_config.group_size,
-                    output_size_per_partition,
-                    device="cuda",
-                    dtype=torch.half,
-                ),
-                requires_grad=False,
+            s_group_data = torch.empty(
+                input_size_per_partition // self.quant_config.group_size,
+                output_size_per_partition,
+                device="cuda",
+                dtype=torch.half,
             )
-        set_weight_attrs(
-            s_group,
-            {
-                "input_dim": None if self.quant_config.group_size == -1 else 0,
-                "output_dim":
-                None if self.quant_config.group_size == -1 else 1,
-            },
-        )
+        s_group_attr = {"data": s_group_data, "weight_loader": weight_loader}
+        if self.quant_config.group_size == -1:
+            s_group = BaseAphroditeParameter(**s_group_attr)
+        else:
+            s_group = GroupQuantScaleParameter(output_dim=1,
+                                               input_dim=0,
+                                               **s_group_attr)

         # Allocate workspace (Used for internal locking mechanism)
         max_workspace_size = (
             output_size_per_partition //
             self.quant_config.min_n_threads) * self.quant_config.max_parallel
-        workspace = Parameter(torch.zeros(max_workspace_size,
-                                          device="cuda",
-                                          dtype=torch.int),
-                              requires_grad=False)
+        workspace = BaseAphroditeParameter(data=torch.zeros(max_workspace_size,
+                                                            device="cuda",
+                                                            dtype=torch.int),
+                                           weight_loader=weight_loader)

         layer.register_parameter("B", qweight)
-        set_weight_attrs(qweight, extra_weight_attrs)
         layer.register_parameter("s_channel", s_channel)
-        set_weight_attrs(s_channel, extra_weight_attrs)
         layer.register_parameter("s_group", s_group)
-        set_weight_attrs(s_group, extra_weight_attrs)
         layer.register_parameter("workspace", workspace)
-        set_weight_attrs(workspace, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # required by torch.compile
+        layer.B = Parameter(layer.B.data, requires_grad=False)
+        layer.s_channel = Parameter(layer.s_channel.data, requires_grad=False)
+        layer.s_group = Parameter(layer.s_group.data, requires_grad=False)
+        layer.workspace = Parameter(layer.workspace.data, requires_grad=False)

     def apply(
         self,
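Both files gain the same process_weights_after_loading() hook: once the checkpoint has been loaded through the parameter classes, each one is re-wrapped as a plain torch.nn.Parameter so torch.compile traces over ordinary tensors. A standalone sketch of the pattern, where the module and parameter name are placeholders rather than the Aphrodite layers:

    import torch
    from torch.nn import Parameter

    def rewrap_as_plain_parameter(layer: torch.nn.Module, name: str) -> None:
        # Keep the loaded tensor (.data) but drop the custom Parameter
        # subclass, leaving a plain, compile-friendly Parameter.
        loaded = getattr(layer, name)
        layer.register_parameter(name,
                                 Parameter(loaded.data, requires_grad=False))

    layer = torch.nn.Linear(8, 8)  # stand-in module for the sketch
    rewrap_as_plain_parameter(layer, "weight")
    assert isinstance(layer.weight, Parameter)
    assert not layer.weight.requires_grad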