From dad6a8acea2efecf9ac322973879a1c7070499e6 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Wed, 13 Dec 2023 09:12:35 -0500
Subject: [PATCH] add `modules_in_block_to_quantize` arg for gptq (#1585)

* add inside_layer_modules

* fix typing

* add test

* fix style

* remove print

* change naming

* fix docstring

* fix

* Update optimum/gptq/quantizer.py

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>

* change name again with felix suggestion

* fix log

* style

* remove print

---------

Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com>
---
 optimum/gptq/quantizer.py       | 32 ++++++++++++++++++++++++++------
 tests/gptq/test_quantization.py | 16 ++++++++++++++++
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 1a3d4b8702c..a088859ee9d 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -77,6 +77,7 @@ def __init__(
         exllama_config: Dict[str, Any] = None,
         max_input_length: Optional[int] = None,
         cache_block_outputs: Optional[bool] = True,
+        modules_in_block_to_quantize: Optional[List[List[str]]] = None,
         *args,
         **kwargs,
     ):
@@ -106,7 +107,7 @@ def __init__(
             model_seqlen (`Optional[int]`, defaults to `None`):
                 The maximum sequence length that the model can take.
             block_name_to_quantize (`Optional[str]`, defaults to `None`):
-                The transformers block name to quantize.
+                The transformers block name to quantize. If None, we will infer the block name using common patterns (e.g. `model.layers`).
             module_name_preceding_first_block (`Optional[List[str]]`, defaults to `None`):
                 The layers that are preceding the first Transformer block.
             batch_size (`int`, defaults to `1`):
@@ -123,6 +124,10 @@ def __init__(
             cache_block_outputs (`bool`, defaults to `True`):
                 Whether to cache block outputs to reuse as inputs for the succeeding block. It allows optimization of non-standard models
                 (e.g. ChatGLM) but can require more time.
+            modules_in_block_to_quantize (`Optional[List[List[str]]]`, defaults to `None`):
+                List of lists of module names to quantize in the specified block. This argument is useful to exclude certain linear modules from being quantized.
+                The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list of modules sequentially.
+                If not set, we will quantize all linear layers. Example: `modules_in_block_to_quantize=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]`
         """
 
         self.bits = bits
@@ -143,6 +148,7 @@ def __init__(
         self.max_input_length = max_input_length
         self.quant_method = QuantizationMethod.GPTQ
         self.cache_block_outputs = cache_block_outputs
+        self.modules_in_block_to_quantize = modules_in_block_to_quantize
 
         self.serialization_keys = [
             "bits",
@@ -153,6 +159,7 @@ def __init__(
             "sym",
             "true_sequential",
             "quant_method",
+            "modules_in_block_to_quantize",
         ]
 
         if self.bits not in [2, 3, 4, 8]:
@@ -210,8 +217,15 @@ def convert_model(self, model: nn.Module):
             self.block_name_to_quantize = get_block_name_with_pattern(model)
         block_name = self.block_name_to_quantize
         layers_to_be_replaced = get_layers(model, prefix=block_name)
+        if self.modules_in_block_to_quantize is not None:
+            layers_to_keep = sum(self.modules_in_block_to_quantize, [])
+            for name in list(layers_to_be_replaced.keys()):
+                if not any(name.endswith(layer) for layer in layers_to_keep):
+                    logger.info(
+                        f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)"
+                    )
+                    del layers_to_be_replaced[name]
         self._replace_by_quant_layers(model, layers_to_be_replaced)
-
         return model
 
     def get_no_split_module_classes(self, model):
@@ -444,11 +458,17 @@ def store_input_hook(_, input, *args):
             if not has_device_map or get_device(block) == torch.device("cpu"):
                 block = block.to(0)
             layers = get_layers(block)
-            if self.true_sequential:
-                # lazy sequential but works well
-                layers_name_list = [[key] for key in layers.keys()]
+            if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0:
+                if self.true_sequential:
+                    layers_name_list = self.modules_in_block_to_quantize
+                else:
+                    layers_name_list = [sum(self.modules_in_block_to_quantize, [])]
             else:
-                layers_name_list = [list(layers.keys())]
+                if self.true_sequential:
+                    # lazy sequential but works well
+                    layers_name_list = [[key] for key in layers.keys()]
+                else:
+                    layers_name_list = [list(layers.keys())]
             logger.info(f"Module to quantize {layers_name_list}")
             for subset_name_list in tqdm(layers_name_list, leave=False, desc="Quantizing layers inside the block"):
                 subset_layers = {name: layers[name] for name in subset_name_list}
diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py
index a24b3e683c6..325fb9c0cfe 100644
--- a/tests/gptq/test_quantization.py
+++ b/tests/gptq/test_quantization.py
@@ -53,6 +53,7 @@ class GPTQTest(unittest.TestCase):
     disable_exllama = True
     exllama_config = None
     cache_block_outputs = True
+    modules_in_block_to_quantize = None
 
     dataset = [
         "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
@@ -78,6 +79,7 @@ def setUpClass(cls):
             disable_exllama=cls.disable_exllama,
             exllama_config=cls.exllama_config,
             cache_block_outputs=cls.cache_block_outputs,
+            modules_in_block_to_quantize=cls.modules_in_block_to_quantize,
         )
         cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer)
 
@@ -300,6 +302,20 @@ class GPTQTestNoBlockCaching(GPTQTest):
     EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
 
 
+class GPTQTestModuleQuant(GPTQTest):
+    # all layers are quantized apart from self_attention.dense
+    modules_in_block_to_quantize = [
+        ["self_attention.query_key_value"],
+        ["mlp.dense_h_to_4h"],
+        ["mlp.dense_4h_to_h"],
+    ]
+    EXPECTED_RELATIVE_DIFFERENCE = 1.57705236164535
+
+    def test_not_converted_layers(self):
+        # self_attention.dense should not be converted
+        self.assertTrue(self.quantized_model.transformer.h[0].self_attention.dense.__class__.__name__ == "Linear")
+
+
 class GPTQUtilsTest(unittest.TestCase):
     """
     Test utilities
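
Usage note (not part of the patch): a minimal sketch of how the new argument could be used, assuming a BLOOM-style checkpoint whose blocks expose the module names exercised by GPTQTestModuleQuant above. The `GPTQQuantizer(...)` / `quantize_model(model, tokenizer)` calls follow the API used in the test; the checkpoint id and calibration dataset below are illustrative choices, not taken from the patch.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

# Assumed checkpoint: any BLOOM-style model exposing the module names used below
# (self_attention.query_key_value, mlp.dense_h_to_4h, mlp.dense_4h_to_h).
model_id = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

quantizer = GPTQQuantizer(
    bits=4,
    dataset="c4",  # illustrative calibration dataset
    # block_name_to_quantize is left unset, so the block name is inferred from
    # common patterns (e.g. transformer.h / model.layers).
    modules_in_block_to_quantize=[
        ["self_attention.query_key_value"],
        ["mlp.dense_h_to_4h"],
        ["mlp.dense_4h_to_h"],
    ],  # self_attention.dense is omitted on purpose and stays a plain nn.Linear
)
quantized_model = quantizer.quantize_model(model, tokenizer)

When `true_sequential` is enabled, each inner list is quantized as its own sequential step inside the block; otherwise the lists are flattened and quantized together, which is what the two branches added in the `@@ -444,11 +458,17 @@` hunk implement.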