From dad6a8acea2efecf9ac322973879a1c7070499e6 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Wed, 13 Dec 2023 09:12:35 -0500
Subject: [PATCH] add `modules_in_block_to_quantize` arg for gptq (#1585)

* add inside_layer_modules

* fix typing

* add test

* fix style

* remove print

* change naming

* fix docstring

* fix

* Update optimum/gptq/quantizer.py

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>

* change name again with felix suggestion

* fix log

* style

* remove print

---------

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>
---
 optimum/gptq/quantizer.py       | 32 ++++++++++++++++++++++++++------
 tests/gptq/test_quantization.py | 16 ++++++++++++++++
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 1a3d4b8702c..a088859ee9d 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -77,6 +77,7 @@ def __init__(
         exllama_config: Dict[str, Any] = None,
         max_input_length: Optional[int] = None,
         cache_block_outputs: Optional[bool] = True,
+        modules_in_block_to_quantize: Optional[List[List[str]]] = None,
         *args,
         **kwargs,
     ):
@@ -106,7 +107,7 @@ def __init__(
             model_seqlen (`Optional[int]`, defaults to `None`):
                 The maximum sequence length that the model can take.
             block_name_to_quantize (`Optional[str]`, defaults to `None`):
-                The transformers block name to quantize.
+                The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. `model.layers`).
             module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
                 The layers that are preceding the first Transformer block.
             batch_size (`int`, defaults to `1`):
@@ -123,6 +124,10 @@ def __init__(
             cache_block_outputs (`bool`, defaults to `True`):
                 Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
                 (e.g. ChatGLM) but can require more time.
+            modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
+                List of lists of module names to quantize in the specified block. This argument is useful to exclude certain linear modules from being quantized.
+                The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list of modules sequentially.
+                If not set, we will quantize all linear layers. Example: `modules_in_block_to_quantize=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
         """
 
         self.bits = bits
@@ -143,6 +148,7 @@ def __init__(
         self.max_input_length = max_input_length
         self.quant_method = QuantizationMethod.GPTQ
         self.cache_block_outputs = cache_block_outputs
+        self.modules_in_block_to_quantize = modules_in_block_to_quantize
 
         self.serialization_keys = [
             "bits",
@@ -153,6 +159,7 @@ def __init__(
             "sym",
             "true_sequential",
             "quant_method",
+            "modules_in_block_to_quantize",
         ]
 
         if self.bits not in [2, 3, 4, 8]:
@@ -210,8 +217,15 @@ def convert_model(self, model: nn.Module):
             self.block_name_to_quantize = get_block_name_with_pattern(model)
         block_name = self.block_name_to_quantize
         layers_to_be_replaced = get_layers(model, prefix=block_name)
+        if self.modules_in_block_to_quantize is not None:
+            layers_to_keep = sum(self.modules_in_block_to_quantize, [])
+            for name in list(layers_to_be_replaced.keys()):
+                if not any(name.endswith(layer) for layer in layers_to_keep):
+                    logger.info(
+                        f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
+                    )
+                    del layers_to_be_replaced[name]
         self._replace_by_quant_layers(model, layers_to_be_replaced)
-
         return model
 
     def get_no_split_module_classes(self, model):
@@ -444,11 +458,17 @@ def store_input_hook(_, input, *args):
             if not has_device_map or get_device(block) == torch.device("cpu"):
                 block = block.to(0)
             layers = get_layers(block)
-            if self.true_sequential:
-                # lazy sequential but works well
-                layers_name_list = [[key] for key in layers.keys()]
+            if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
+                if self.true_sequential:
+                    layers_name_list = self.modules_in_block_to_quantize
+                else:
+                    layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
             else:
-                layers_name_list = [list(layers.keys())]
+                if self.true_sequential:
+                    # lazy sequential but works well
+                    layers_name_list = [[key] for key in layers.keys()]
+                else:
+                    layers_name_list = [list(layers.keys())]
             logger.info(f"Module to quantize {layers_name_list}")
             for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
                 subset_layers = {name: layers[name] for name in subset_name_list}
diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py
index a24b3e683c6..325fb9c0cfe 100644
--- a/tests/gptq/test_quantization.py
+++ b/tests/gptq/test_quantization.py
@@ -53,6 +53,7 @@ class GPTQTest(unittest.TestCase):
     disable_exllama = True
     exllama_config = None
     cache_block_outputs = True
+    modules_in_block_to_quantize = None
 
     dataset = [
         "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
@@ -78,6 +79,7 @@ def setUpClass(cls):
             disable_exllama=cls.disable_exllama,
             exllama_config=cls.exllama_config,
             cache_block_outputs=cls.cache_block_outputs,
+            modules_in_block_to_quantize=cls.modules_in_block_to_quantize,
         )
         cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer)
 
@@ -300,6 +302,20 @@ class GPTQTestNoBlockCaching(GPTQTest):
     EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
 
 
+class GPTQTestModuleQuant(GPTQTest):
+    # all layers are quantized apart from self_attention.dense
+    modules_in_block_to_quantize = [
+        ["self_attention.query_key_value"],
+        ["mlp.dense_h_to_4h"],
+        ["mlp.dense_4h_to_h"],
+    ]
+    EXPECTED_RELATIVE_DIFFERENCE = 1.57705236164535
+
+    def test_not_converted_layers(self):
+        # self_attention.dense should not be converted
+        self.assertTrue(self.quantized_model.transformer.h[0].self_attention.dense.__class__.__name__ == "Linear")
+
+
 class GPTQUtilsTest(unittest.TestCase):
     """
     Test utilities
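
Usage note (not part of the patch): a minimal sketch of how the new argument could be used, assuming a BLOOM-style checkpoint whose blocks expose the module names exercised by GPTQTestModuleQuant above. The `GPTQQuantizer(...)` / `quantize_model(model, tokenizer)` calls follow the API used in the test; the checkpoint id and calibration dataset below are illustrative choices, not taken from the patch.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

# Assumed checkpoint: any BLOOM-style model exposing the module names used below
# (self_attention.query_key_value, mlp.dense_h_to_4h, mlp.dense_4h_to_h).
model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

quantizer = GPTQQuantizer(
    bits=4,
    dataset="c4",  # illustrative calibration dataset
    # block_name_to_quantize is left unset, so the block name is inferred from
    # common patterns (e.g. transformer.h / model.layers).
    modules_in_block_to_quantize=[
        ["self_attention.query_key_value"],
        ["mlp.dense_h_to_4h"],
        ["mlp.dense_4h_to_h"],
    ],  # self_attention.dense is omitted on purpose and stays a plain nn.Linear
)
quantized_model = quantizer.quantize_model(model, tokenizer)

When `true_sequential` is enabled, each inner list is quantized as its own sequential step inside the block; otherwise the lists are flattened and quantized together, which is what the two branches added in the `@@ -444,11 +458,17 @@` hunk implement.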