add modules_in_block_to_quantize arg for gptq (#1585)
* add inside_layer_modules

* fix typing

* add test

* fix style

* remove print

* change naming

* fix docstring

* fix

* Update optimum/gptq/quantizer.py

Co-authored-by: fxmarty <[email protected]>

* change name again with felix suggestion

* fix log

* style

* remove print

---------

Co-authored-by: fxmarty <[email protected]>
SunMarc and fxmarty authored Dec 13, 2023
1 parent be87d2c commit dad6a8a
Showing 2 changed files with 42 additions and 6 deletions.
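Before the diff itself, a minimal usage sketch of the new argument (not part of the commit): the checkpoint name is an illustrative assumption, the calibration sentence is taken from the test file below, and the module names match the BLOOM-style layout used in the tests.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer

model_id = "bigscience/bloom-560m"  # assumption: any BLOOM-style checkpoint with self_attention/mlp submodules
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

quantizer = GPTQQuantizer(
    bits=4,
    dataset=["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."],
    # New in this commit: only these modules inside each transformer block are quantized,
    # so e.g. self_attention.dense stays a plain nn.Linear.
    modules_in_block_to_quantize=[
        ["self_attention.query_key_value"],
        ["mlp.dense_h_to_4h"],
        ["mlp.dense_4h_to_h"],
    ],
)
quantized_model = quantizer.quantize_model(model, tokenizer)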
32 changes: 26 additions & 6 deletions optimum/gptq/quantizer.py
@@ -77,6 +77,7 @@ def __init__(
exllama_config: Dict[str, Any] = None,
max_input_length: Optional[int] = None,
cache_block_outputs: Optional[bool] = True,
modules_in_block_to_quantize: Optional[List[List[str]]] = None,
*args,
**kwargs,
):
@@ -106,7 +107,7 @@ def __init__(
model_seqlen (`Optional[int]`, defaults to `None`):
The maximum sequence length that the model can take.
block_name_to_quantize (`Optional[str]`, defaults to `None`):
The transformers block name to quantize.
The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. model.layers)
module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
The layers that are preceding the first Transformer block.
batch_size (`int`, defaults to `1`):
@@ -123,6 +124,10 @@ def __init__(
cache_block_outputs (`bool`, defaults to `True`):
Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
(e.g. ChatGLM) but can require more time.
modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
List of lists of module names to quantize in the specified block. This argument is useful for excluding certain linear modules from quantization.
The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially.
If not set, we will quantize all linear layers. Example: `modules_in_block_to_quantize=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
"""

self.bits = bits
@@ -143,6 +148,7 @@ def __init__(
self.max_input_length = max_input_length
self.quant_method = QuantizationMethod.GPTQ
self.cache_block_outputs = cache_block_outputs
self.modules_in_block_to_quantize = modules_in_block_to_quantize

self.serialization_keys = [
"bits",
@@ -153,6 +159,7 @@ def __init__(
"sym",
"true_sequential",
"quant_method",
"modules_in_block_to_quantize",
]

if self.bits not in [2, 3, 4, 8]:
@@ -210,8 +217,15 @@ def convert_model(self, model: nn.Module):
self.block_name_to_quantize = get_block_name_with_pattern(model)
block_name = self.block_name_to_quantize
layers_to_be_replaced = get_layers(model, prefix=block_name)
if self.modules_in_block_to_quantize is not None:
layers_to_keep = sum(self.modules_in_block_to_quantize, [])
for name in list(layers_to_be_replaced.keys()):
if not any(name.endswith(layer) for layer in layers_to_keep):
logger.info(
f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
)
del layers_to_be_replaced[name]
self._replace_by_quant_layers(model, layers_to_be_replaced)

return model

def get_no_split_module_classes(self, model):
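The filter added to convert_model above keeps only the layers whose names end with one of the configured module names. A standalone sketch of that selection, with illustrative layer names rather than repository code:

# Illustrative sketch of the suffix filter used in convert_model above.
modules_in_block_to_quantize = [["self_attention.query_key_value"], ["mlp.dense_4h_to_h"]]
layers_to_be_replaced = {
    "transformer.h.0.self_attention.query_key_value": "Linear",
    "transformer.h.0.self_attention.dense": "Linear",
    "transformer.h.0.mlp.dense_4h_to_h": "Linear",
}

layers_to_keep = sum(modules_in_block_to_quantize, [])  # flatten the list of lists into allowed suffixes
for name in list(layers_to_be_replaced.keys()):
    if not any(name.endswith(suffix) for suffix in layers_to_keep):
        del layers_to_be_replaced[name]  # self_attention.dense is dropped and stays unquantized

print(sorted(layers_to_be_replaced))  # only the two whitelisted module names remain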
@@ -444,11 +458,17 @@ def store_input_hook(_, input, *args):
if not has_device_map or get_device(block) == torch.device("cpu"):
block = block.to(0)
layers = get_layers(block)
if self.true_sequential:
# lazy sequential but works well
layers_name_list = [[key] for key in layers.keys()]
if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
if self.true_sequential:
layers_name_list = self.modules_in_block_to_quantize
else:
layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
else:
layers_name_list = [list(layers.keys())]
if self.true_sequential:
# lazy sequential but works well
layers_name_list = [[key] for key in layers.keys()]
else:
layers_name_list = [list(layers.keys())]
logger.info(f"Module to quantize {layers_name_list}")
for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
subset_layers = {name: layers[name] for name in subset_name_list}
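The grouping logic added to the quantization loop above can be summarized by the following helper, pulled out purely for illustration (the function name is not part of the codebase):

def build_layers_name_list(layers, modules_in_block_to_quantize=None, true_sequential=True):
    """Illustrative helper: which groups of layers are quantized together inside one block."""
    if isinstance(modules_in_block_to_quantize, list) and len(modules_in_block_to_quantize) > 0:
        if true_sequential:
            # Each inner list becomes its own sequential quantization step.
            return modules_in_block_to_quantize
        # Otherwise, flatten everything into a single group quantized in one pass.
        return [sum(modules_in_block_to_quantize, [])]
    if true_sequential:
        # Previous behaviour: one group per linear layer, in order.
        return [[key] for key in layers.keys()]
    return [list(layers.keys())]

layers = {"self_attention.query_key_value": None, "self_attention.dense": None, "mlp.dense_h_to_4h": None}
print(build_layers_name_list(layers, [["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]))
# -> [['self_attention.query_key_value'], ['mlp.dense_h_to_4h']]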
16 changes: 16 additions & 0 deletions tests/gptq/test_quantization.py
@@ -53,6 +53,7 @@ class GPTQTest(unittest.TestCase):
disable_exllama = True
exllama_config = None
cache_block_outputs = True
modules_in_block_to_quantize = None

dataset = [
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
@@ -78,6 +79,7 @@ def setUpClass(cls):
disable_exllama=cls.disable_exllama,
exllama_config=cls.exllama_config,
cache_block_outputs=cls.cache_block_outputs,
modules_in_block_to_quantize=cls.modules_in_block_to_quantize,
)

cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer)
@@ -300,6 +302,20 @@ class GPTQTestNoBlockCaching(GPTQTest):
EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")


class GPTQTestModuleQuant(GPTQTest):
# all layers are quantized apart from self_attention.dense
modules_in_block_to_quantize = [
["self_attention.query_key_value"],
["mlp.dense_h_to_4h"],
["mlp.dense_4h_to_h"],
]
EXPECTED_RELATIVE_DIFFERENCE = 1.57705236164535

def test_not_converted_layers(self):
# self_attention.dense should not be converted
self.assertTrue(self.quantized_model.transformer.h[0].self_attention.dense.__class__.__name__ == "Linear")


class GPTQUtilsTest(unittest.TestCase):
"""
Test utilities
