From 5db25419bf89819ea6a84528e84d737c4ffdae49 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 11:19:13 +0000 Subject: [PATCH 01/19] add_exllamav2 --- .../usage_guides/quantization.mdx | 15 ++++++- optimum/gptq/quantizer.py | 45 ++++++++++++++----- tests/gptq/test_quantization.py | 45 +++++++++++++++++-- 3 files changed, 89 insertions(+), 16 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 2ec8d1f6683..5b669841829 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -89,8 +89,21 @@ empty_model.tie_weights() quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) ``` -Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. +With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to +pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: +```py +from optimum.gptq import GPTQQuantizer, load_quantized_model +import torch + +from accelerate import init_empty_weights +with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) +empty_model.tie_weights() +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) +``` + +Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. #### Fine-tune a quantized model With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been quantized with GPTQ. diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 70e8dfa954e..ae975c9b8e0 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,7 +69,8 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -107,8 +108,10 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `False`): + disable_exllama (`bool`, defaults to `True`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllamav2 (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. 
@@ -128,6 +131,7 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama + self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -137,6 +141,9 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") + if not self.disable_exllamav2 and not self.disable_exllama: + logger.warning("You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`") + self.disable_exllama=True def to_dict(self): """ @@ -205,6 +212,7 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -440,13 +448,21 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4: + # device not on gpu if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - elif self.desc_act: + if not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + if not self.disable_exllamav2: + logger.warning( + "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" + ) + self.disable_exllamav2 = True + # act order and exllama + elif self.desc_act and not self.disable_exllama: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " @@ -475,13 +491,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." 
+ "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" ) class StoreAttr(object): @@ -514,6 +530,7 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -579,7 +596,8 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, ): """ @@ -615,6 +633,8 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllama (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -648,6 +668,7 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama + quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 53ff1a722e5..ad3e36be73a 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,6 +46,7 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True + disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
@@ -69,6 +70,7 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, + disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -96,6 +98,7 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -133,13 +136,14 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama, disable_exllamav2=self.disable_exllamav2 ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False + disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -153,6 +157,7 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True + disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -178,7 +183,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True ) self.check_inference_correctness(quantized_model_from_saved) @@ -197,7 +202,7 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028, disable_exllamav2=True ) prompt = "I am in Paris and" * 1000 @@ -213,6 +218,40 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) + +class GPTQTestExllamav2(GPTQTest): + desc_act = False + disable_exllama = True + disable_exllamav2 = True + + def test_generate_quality(self): + # don't need to test + pass + + def test_serialization(self): + # don't need to test + pass + + def test_exllama_serialization(self): + """ + Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel + """ + from accelerate import init_empty_weights + + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantizer.save(self.quantized_model, tmpdirname) + self.quantized_model.config.save_pretrained(tmpdirname) + with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_config( + AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 + ) + empty_model.tie_weights() + quantized_model_from_saved = load_quantized_model( + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllamav2=False, + ) + self.check_inference_correctness(quantized_model_from_saved) + + class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 03441b8acab14e03ed322154dfd0984cb4b1699c Mon Sep 17 00:00:00 2001 From: Marc 
Sun Date: Wed, 27 Sep 2023 11:50:45 +0000 Subject: [PATCH 02/19] style --- optimum/gptq/quantizer.py | 8 +++++--- tests/gptq/test_quantization.py | 23 +++++++++++++++++------ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index ae975c9b8e0..6ed928f10fd 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -70,7 +70,7 @@ def __init__( batch_size: int = 1, pad_token_id: Optional[int] = None, disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -142,8 +142,10 @@ def __init__( if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") if not self.disable_exllamav2 and not self.disable_exllama: - logger.warning("You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`") - self.disable_exllama=True + logger.warning( + "You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" + ) + self.disable_exllama = True def to_dict(self): """ diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index ad3e36be73a..946b21e90bd 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -136,7 +136,11 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama, disable_exllamav2=self.disable_exllamav2 + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.check_inference_correctness(quantized_model_from_saved) @@ -202,7 +206,12 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028, disable_exllamav2=True + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=False, + max_input_length=4028, + disable_exllamav2=True, ) prompt = "I am in Paris and" * 1000 @@ -218,12 +227,11 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) - class GPTQTestExllamav2(GPTQTest): desc_act = False disable_exllama = True disable_exllamav2 = True - + def test_generate_quality(self): # don't need to test pass @@ -247,11 +255,14 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllamav2=False, + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllamav2=False, ) self.check_inference_correctness(quantized_model_from_saved) - + class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 80d085e239540f103c8695618df6811ab592a40f Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 14:21:06 +0000 Subject: [PATCH 03/19] fix doc --- docs/source/llm_quantization/usage_guides/quantization.mdx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 5b669841829..ece7a69be92 100644 --- 
a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,7 +86,7 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) ``` With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to @@ -104,6 +104,9 @@ quantized_model = load_quantized_model(empty_model, save_folder=save_folder, dev ``` Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. + +You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) + #### Fine-tune a quantized model With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been quantized with GPTQ. From 0c53c2f4a9334e32b1606997281528faa31c0f63 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:03:14 +0000 Subject: [PATCH 04/19] simplify script --- tests/benchmark/benchmark_gptq.py | 258 ++++++++++++++++-------------- 1 file changed, 139 insertions(+), 119 deletions(-) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 06af05056a3..369e6922e8e 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -1,12 +1,10 @@ import argparse import gc -import json import os import time import numpy as np import torch -from accelerate import init_empty_weights from memory_tracker import MemoryTracker from tqdm import tqdm from transformers import ( @@ -16,11 +14,11 @@ AutoTokenizer, BitsAndBytesConfig, GenerationConfig, + GPTQConfig ) from optimum.exporters import TasksManager -from optimum.gptq import load_quantized_model - +from auto_gptq.utils import Perplexity def get_parser(): parser = argparse.ArgumentParser() @@ -45,13 +43,7 @@ def get_parser(): parser.add_argument( "--model", type=str, - help="Model to benchmark (in the non-quantized case), or reference architecture corresponding to the quantized model (GPTQ case)", - ) - parser.add_argument( - "--gptq-model", - type=str, - default=None, - help="Path to a local GPTQ model.", + help="Model to benchmark", ) parser.add_argument( "--prompt-length", @@ -90,6 +82,27 @@ def get_parser(): action="store_true", help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) + parser.add_argument( + "--disable-exllamav2", + action="store_true", + help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", + ) + parser.add_argument( + "--generate", + action="store_true", + help="Calculate the generate speed (prompt processing + token generation)", + ) + parser.add_argument( + "--ppl", + action="store_true", + help="Calculate the perplexity on wikitext2 dataset", + ) + parser.add_argument( + "--revision", + default=None, + help="Revision of the model to benchmark", + ) + return parser @@ -266,7 +279,7 @@ def benchmark_memory( device = 
torch.device("cuda:0") memory_tracker = MemoryTracker() -tokenizer = AutoTokenizer.from_pretrained(args.model) +tokenizer = AutoTokenizer.from_pretrained(args.model,revision=args.revision, use_fast=False) if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -288,46 +301,14 @@ def benchmark_memory( else: is_decoder = False -act_order = None -bits = None -group_size = None -kernel = None -if args.gptq: - if not args.gptq_model: - raise ValueError("The argument --gptq-model needs to be provided when benchmarking GPTQ.") - - with open(os.path.join(args.gptq_model, "quantization_config.json"), "r", encoding="utf-8") as f: - quantize_config_dict = json.load(f) - - act_order = quantize_config_dict["desc_act"] - bits = quantize_config_dict["bits"] - group_size = quantize_config_dict["group_size"] - - if not args.disable_exllama: - # Exllama kernel can handle both the act-order / no act-order cases. - kernel = "exllama" - elif act_order: - kernel = "autotogptq-cuda" - else: - kernel = "autogptq-cuda-old" - load_start = time.time_ns() if args.gptq: - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16) - empty_model.tie_weights() - model = load_quantized_model( - empty_model, - save_folder=args.gptq_model, - state_dict_name="model.safetensors", - device_map="auto", - disable_exllama=args.disable_exllama, - ) + quantization_config = GPTQConfig(bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2) + model = autoclass.from_pretrained(args.model,revision=args.revision, quantization_config=quantization_config, torch_dtype=torch.float16, device_map="auto") elif args.bitsandbytes: quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="fp4", bnb_4bit_compute_dtype=torch.float16 ) - model = autoclass.from_pretrained( args.model, quantization_config=quantization_config, device_map="auto", torch_dtype=torch.float16 ) @@ -337,6 +318,27 @@ def benchmark_memory( torch.cuda.synchronize() load_end = time.time_ns() +act_order = None +bits = None +group_size = None +kernel = None + +if args.gptq: + quantization_config_dict = model.config.quantization_config.to_dict() + act_order = quantization_config_dict["desc_act"] + bits = quantization_config_dict["bits"] + group_size = quantization_config_dict["group_size"] + + if not args.disable_exllamav2: + kernel = "exllamav2" + elif not args.disable_exllama: + # Exllama kernel can handle both the act-order / no act-order cases. + kernel = "exllama" + elif act_order: + kernel = "autotogptq-cuda" + else: + kernel = "autogptq-cuda-old" + load_time = (load_end - load_start) * 1e-9 print(f"Model load time: {load_time:.1f} s") @@ -364,82 +366,100 @@ def benchmark_memory( file_name = file_name + "_noquant" quantization = None -file_name = file_name + ".csv" -output_file = open(file_name, "w") -header = "quantization, act_order, bits, group_size, kernel, num_batches, batch_size, prompt_length, new_tokens, Load time (s), Per-token latency (ms), Throughput (tok/s), Max memory (MB)\n" -output_file.write(header) - -latencies = {} -throughputs = {} -all_max_mem = {} -print( - "WARNING: The reported peak memory is only a rough estimate, and can NOT be precisely relied upon to estimate an OOM limit." 
-) - -for batch_size in tqdm(batch_sizes): - for prompt_length in tqdm(prompt_lengths): - for new_token in tqdm(new_tokens): - print(f"---- Running: batch_size={batch_size}, prompt_length={prompt_length}, new_tokens={new_token}") - - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) - masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) +if args.ppl: + output_file = open(file_name + "_perplexity.csv", "w") + header = "quantization, act_order, bits, group_size, kernel, perplexity\n" + output_file.write(header) + ppl = Perplexity(model, tokenizer) + ppl_value = np.mean(ppl.calculate_perplexity()) + line = "{},{},{},{},{},{}\n".format( + quantization, + act_order, + bits, + group_size, + kernel, + f"{ppl_value:.2f}", + ) + print(header) + print(line) + output_file.write(line) + output_file.close() + +if args.generate: + output_file = open(file_name + ".csv", "w") + header = "quantization, act_order, bits, group_size, kernel, num_batches, batch_size, prompt_length, new_tokens, Load time (s), Per-token latency (ms), Throughput (tok/s), Max memory (MB)\n" + output_file.write(header) + + latencies = {} + throughputs = {} + all_max_mem = {} + print( + "WARNING: The reported peak memory is only a rough estimate, and can NOT be precisely relied upon to estimate an OOM limit." + ) - with torch.no_grad(): - max_mem = benchmark_memory( - model, - input_ids, - masks, - args.num_batches, - is_decoder, - new_token, - tokenizer.pad_token_id, - memory_tracker=memory_tracker, + for batch_size in tqdm(batch_sizes): + for prompt_length in tqdm(prompt_lengths): + for new_token in tqdm(new_tokens): + print(f"---- Running: batch_size={batch_size}, prompt_length={prompt_length}, new_tokens={new_token}") + + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) + masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) + + with torch.no_grad(): + max_mem = benchmark_memory( + model, + input_ids, + masks, + args.num_batches, + is_decoder, + new_token, + tokenizer.pad_token_id, + memory_tracker=memory_tracker, + ) + + mean_latency = benchmark_latency( + model, + input_ids, + masks, + args.num_batches, + is_decoder, + new_token, + tokenizer.pad_token_id, + memory_tracker=memory_tracker, + ) + + index = (batch_size, prompt_length, new_token) + + per_token_latency = mean_latency / new_token + latencies[index] = per_token_latency + + throughput = batch_size / (per_token_latency * 1e-3) + throughputs[index] = throughput + all_max_mem[index] = max_mem + + print( + f"Latency per token: {per_token_latency:.3f} ms, throughput: {throughput:.3f} tok/s, peak mem: {max_mem:.2f} MB" ) - mean_latency = benchmark_latency( - model, - input_ids, - masks, + line = "{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format( + quantization, + act_order, + bits, + group_size, + kernel, args.num_batches, - is_decoder, + batch_size, + prompt_length, new_token, - tokenizer.pad_token_id, - memory_tracker=memory_tracker, + f"{load_time:.2f}", + f"{per_token_latency:.2f}", + f"{throughput:.2f}", + f"{max_mem:.2f}", ) - - index = (batch_size, prompt_length, new_token) - - per_token_latency = mean_latency / new_token - latencies[index] = per_token_latency - - throughput = batch_size / (per_token_latency * 1e-3) - throughputs[index] = throughput - all_max_mem[index] = max_mem - - 
print( - f"Latency per token: {per_token_latency:.3f} ms, throughput: {throughput:.3f} tok/s, peak mem: {max_mem:.2f} MB" - ) - - line = "{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format( - quantization, - act_order, - bits, - group_size, - kernel, - args.num_batches, - batch_size, - prompt_length, - new_token, - f"{load_time:.2f}", - f"{per_token_latency:.2f}", - f"{throughput:.2f}", - f"{max_mem:.2f}", - ) - print(header) - print(line) - output_file.write(line) - -output_file.close() + print(header) + print(line) + output_file.write(line) + output_file.close() From 216213e46e094de9d72614c09b058dceb1b35020 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:18:06 +0000 Subject: [PATCH 05/19] style --- tests/benchmark/benchmark_gptq.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 369e6922e8e..45fdc262cee 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -5,6 +5,7 @@ import numpy as np import torch +from auto_gptq.utils import Perplexity from memory_tracker import MemoryTracker from tqdm import tqdm from transformers import ( @@ -14,11 +15,11 @@ AutoTokenizer, BitsAndBytesConfig, GenerationConfig, - GPTQConfig + GPTQConfig, ) from optimum.exporters import TasksManager -from auto_gptq.utils import Perplexity + def get_parser(): parser = argparse.ArgumentParser() @@ -102,7 +103,7 @@ def get_parser(): default=None, help="Revision of the model to benchmark", ) - + return parser @@ -279,7 +280,7 @@ def benchmark_memory( device = torch.device("cuda:0") memory_tracker = MemoryTracker() -tokenizer = AutoTokenizer.from_pretrained(args.model,revision=args.revision, use_fast=False) +tokenizer = AutoTokenizer.from_pretrained(args.model, revision=args.revision, use_fast=False) if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -303,8 +304,16 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: - quantization_config = GPTQConfig(bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2) - model = autoclass.from_pretrained(args.model,revision=args.revision, quantization_config=quantization_config, torch_dtype=torch.float16, device_map="auto") + quantization_config = GPTQConfig( + bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2 + ) + model = autoclass.from_pretrained( + args.model, + revision=args.revision, + quantization_config=quantization_config, + torch_dtype=torch.float16, + device_map="auto", + ) elif args.bitsandbytes: quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="fp4", bnb_4bit_compute_dtype=torch.float16 @@ -338,7 +347,7 @@ def benchmark_memory( kernel = "autotogptq-cuda" else: kernel = "autogptq-cuda-old" - + load_time = (load_end - load_start) * 1e-9 print(f"Model load time: {load_time:.1f} s") From dadc6dc9e3f4001dfd55c68fef0acc028ffdbe79 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:54:58 +0000 Subject: [PATCH 06/19] update perplexity measure --- tests/benchmark/README.md | 59 +++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index e1fb1f01dd5..ea6dedc1523 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -11,22 +11,20 @@ The results below are for AutoGPTQ 0.4.2, PyTorch 2.0.1, bitsandbytes 
0.41.1, tr Run ```shell -git clone --branch main https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ -cd Llama-2-13B-chat-GPTQ -mv gptq_model-4bit-128g.safetensors model.safetensors -mv quantize_config.json quantization_config.json - # pytorch fp16 -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --generate + +# GPTQ with exllamav2 kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model /path/to/Llama-2-13B-chat-GPTQ/ --sweep --num-batches 4 --gptq --task text-generation +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --disable-exllamav2 --task text-generation --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model /path/to/Llama-2-13B-chat-GPTQ/ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --disable-exllamav2 --generate # using bitsandbytes fp4/fp16 scheme -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes --generate ``` Here are results obtained on a single NVIDIA A100-SXM4-80GB GPU. We use a prompt length of 512, and generate exactly 512 new tokens. Each generation is repeated for 4 batches, and metrics are averaged over the number of batches and generation length. 
@@ -88,16 +86,20 @@ Run ```shell # pytorch fp16 -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate + +# GPTQ with exllamav2 kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate + +# GPTQ with exllamav kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --disable-exllamav2 --sweep --num-batches 10 --gptq --task text-generation --prefill --generate -# GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --disable-exllamav2 --generate # using bitsandbytes fp4/fp16 scheme -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes --generate ``` The benchmark below is for a prompt length of 512, measuring only the prefill step on a single NVIDIA A100-SXM4-80GB GPU. The forward is repeated 10 times. This benchmark typically corresponds to the forward during training (to the difference that here `generate` is called, which has some overhead). 
@@ -146,3 +148,32 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |gptq |False |4 |128 |exllama |10 |16 |512 |1 |38.35 |1280.25 |12.50 |17203.22 | |gptq |False |4 |128 |autogptq-cuda-old|10 |16 |512 |1 |43.94 |1533.54 |10.43 |17060.76 | |bitsandbytes|None|None|None|None|512|1 |37.46|1256.88|12.73|17737.95| + +## Perplexity benchmark results + +Run + +```shell +# pytorch fp16 +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl + +# GPTQ with exllamav2 kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl + +# GPTQ with exllama kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --disable-exllamav2 --task text-generation --ppl + +# GPTQ without exllama kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --disable-exllamav2 --ppl + +# using bitsandbytes fp4/fp16 scheme +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf ---task text-generation --bitsandbytes --ppl +``` + +| quantization | act_order | bits | group_size | kernel | perplexity | +|--------------|-----------|------|------------|------------------|------------| +| None | None | None | None | None | 6.61 | +| gptq | True | 4 | 128 | exllamav2 | 6.77 | +| gptq | True | 4 | 128 | exllama | 6.77 | +| gptq | True | 4 | 128 | autogptq-cuda-old| 6.77 | +| bitsandbytes | None | 4 | None | None | 6.78 | \ No newline at end of file From cf4019d685e133f6c6b36d55176f85494336c487 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 17:01:37 +0000 Subject: [PATCH 07/19] Revert "Merge branch 'add_exllamav2' into update-benchmark-gptq" This reverts commit f2dbdc2ea13183c353dfa22135d2a7f401a3dbbb, reversing changes made to 216213e46e094de9d72614c09b058dceb1b35020. --- .../usage_guides/quantization.mdx | 20 +------ optimum/gptq/quantizer.py | 47 ++++------------ tests/gptq/test_quantization.py | 56 +------------------ 3 files changed, 17 insertions(+), 106 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index ece7a69be92..2ec8d1f6683 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,26 +86,10 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) ``` -With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. 
You just need to -pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: - -```py -from optimum.gptq import GPTQQuantizer, load_quantized_model -import torch - -from accelerate import init_empty_weights -with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) -empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) -``` - -Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. - -You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) +Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. #### Fine-tune a quantized model diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 6ed928f10fd..70e8dfa954e 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,8 +69,7 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -108,10 +107,8 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `True`): + disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllamav2 (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -131,7 +128,6 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama - self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -141,11 +137,6 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") - if not self.disable_exllamav2 and not self.disable_exllama: - logger.warning( - "You have activated exllama and exllamav2 backend. 
Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" - ) - self.disable_exllama = True def to_dict(self): """ @@ -214,7 +205,6 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -450,21 +440,13 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4: - # device not on gpu + if self.bits == 4 and not self.disable_exllama: if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - if not self.disable_exllama: - logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - if not self.disable_exllamav2: - logger.warning( - "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" - ) - self.disable_exllamav2 = True - # act order and exllama - elif self.desc_act and not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + elif self.desc_act: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " @@ -493,13 +475,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): + if self.bits == 4 and not self.disable_exllama: if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." + "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" ) class StoreAttr(object): @@ -532,7 +514,6 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -598,8 +579,7 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, ): """ @@ -635,8 +615,6 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllama (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. 
It is specific to the exllama backend with act-order. @@ -670,7 +648,6 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama - quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 946b21e90bd..53ff1a722e5 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,7 +46,6 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True - disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." @@ -70,7 +69,6 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, - disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -98,7 +96,6 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -136,18 +133,13 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False - disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -161,7 +153,6 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True - disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -187,7 +178,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False ) self.check_inference_correctness(quantized_model_from_saved) @@ -206,12 +197,7 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=False, - max_input_length=4028, - disable_exllamav2=True, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 ) prompt = "I am in Paris and" * 1000 @@ -227,42 +213,6 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) -class GPTQTestExllamav2(GPTQTest): - desc_act = False - disable_exllama = True - disable_exllamav2 = True - - def test_generate_quality(self): - # don't need to test - pass - - def test_serialization(self): - # don't need to test - pass - - def test_exllama_serialization(self): - """ - Test the serialization of the 
model and the loading of the quantized weights with exllamav2 kernel - """ - from accelerate import init_empty_weights - - with tempfile.TemporaryDirectory() as tmpdirname: - self.quantizer.save(self.quantized_model, tmpdirname) - self.quantized_model.config.save_pretrained(tmpdirname) - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 - ) - empty_model.tie_weights() - quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllamav2=False, - ) - self.check_inference_correctness(quantized_model_from_saved) - - class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 97a7c62b0cf09ad4671a4198958977143a1191cf Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:20:52 +0000 Subject: [PATCH 08/19] Merge branch 'add_exllamav2' into update-benchmark-gptq --- .../usage_guides/quantization.mdx | 20 ++++++- optimum/gptq/quantizer.py | 47 ++++++++++++---- tests/gptq/test_quantization.py | 56 ++++++++++++++++++- 3 files changed, 106 insertions(+), 17 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 2ec8d1f6683..ece7a69be92 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,10 +86,26 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) ``` -Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. +With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to +pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: + +```py +from optimum.gptq import GPTQQuantizer, load_quantized_model +import torch + +from accelerate import init_empty_weights +with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) +empty_model.tie_weights() +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) +``` + +Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. 
+ +You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) #### Fine-tune a quantized model diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 70e8dfa954e..6ed928f10fd 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,7 +69,8 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -107,8 +108,10 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `False`): + disable_exllama (`bool`, defaults to `True`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllamav2 (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -128,6 +131,7 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama + self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -137,6 +141,11 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") + if not self.disable_exllamav2 and not self.disable_exllama: + logger.warning( + "You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" + ) + self.disable_exllama = True def to_dict(self): """ @@ -205,6 +214,7 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -440,13 +450,21 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4: + # device not on gpu if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - elif self.desc_act: + if not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + if not self.disable_exllamav2: + logger.warning( + "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" + ) + self.disable_exllamav2 = True + # act order and exllama + elif self.desc_act and not self.disable_exllama: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. 
You should only use Exllama backend with act_order for inference. " @@ -475,13 +493,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." + "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" ) class StoreAttr(object): @@ -514,6 +532,7 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -579,7 +598,8 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, ): """ @@ -615,6 +635,8 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllama (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -648,6 +670,7 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama + quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 53ff1a722e5..946b21e90bd 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,6 +46,7 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True + disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
@@ -69,6 +70,7 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, + disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -96,6 +98,7 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -133,13 +136,18 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False + disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -153,6 +161,7 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True + disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -178,7 +187,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True ) self.check_inference_correctness(quantized_model_from_saved) @@ -197,7 +206,12 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=False, + max_input_length=4028, + disable_exllamav2=True, ) prompt = "I am in Paris and" * 1000 @@ -213,6 +227,42 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) +class GPTQTestExllamav2(GPTQTest): + desc_act = False + disable_exllama = True + disable_exllamav2 = True + + def test_generate_quality(self): + # don't need to test + pass + + def test_serialization(self): + # don't need to test + pass + + def test_exllama_serialization(self): + """ + Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel + """ + from accelerate import init_empty_weights + + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantizer.save(self.quantized_model, tmpdirname) + self.quantized_model.config.save_pretrained(tmpdirname) + with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_config( + AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 + ) + empty_model.tie_weights() + quantized_model_from_saved = load_quantized_model( + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllamav2=False, + ) + self.check_inference_correctness(quantized_model_from_saved) + + class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 62b89d954de63e88265acdb07567ff1f68aeafad Mon Sep 17 
00:00:00 2001 From: Marc Sun Date: Thu, 28 Sep 2023 15:11:17 +0000 Subject: [PATCH 09/19] fix arg in llama attention --- optimum/bettertransformer/models/attention.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 702aca3257b..829609cdcbd 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -583,6 +583,7 @@ def llama_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions is True: raise ValueError("output_attentions=True can not be supported with BetterTransformer.") From 1ef6ce523d2dfa07f3e2e2a1daad2f27e259e6cb Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 29 Sep 2023 08:51:35 +0000 Subject: [PATCH 10/19] flash_attention arg --- tests/benchmark/benchmark_gptq.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 45fdc262cee..57b74776ca0 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -103,6 +103,11 @@ def get_parser(): default=None, help="Revision of the model to benchmark", ) + parser.add_argument( + "--enable-flash", + action="store_true", + help="Use flash attention", + ) return parser @@ -124,7 +129,8 @@ def timing_cuda( start_event.record() if is_decoder: - _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) + with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): + _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) else: _ = model(input_ids, masks) end_event.record() @@ -157,7 +163,9 @@ def warmup( eos_token_id=None, # This is required for min_new_tokens to actually have an effect. ) model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. 
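To make the `--enable-flash` flag above concrete: the benchmark wraps `generate` in PyTorch's SDPA kernel-selection context manager so that only one attention backend is active while timing. A minimal standalone sketch of that pattern (illustrative only; the tensor shapes and dtype below are assumptions, not values taken from the benchmark script):

```py
import torch
import torch.nn.functional as F

# Force scaled_dot_product_attention onto a single backend while benchmarking.
# enable_flash mirrors the --enable-flash CLI flag; the math and mem_efficient
# backends are disabled so the measurement isolates the chosen kernel.
enable_flash = True

q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)
with torch.backends.cuda.sdp_kernel(
    enable_flash=enable_flash, enable_math=False, enable_mem_efficient=False
):
    out = F.scaled_dot_product_attention(q, k, v)
```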
- res = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) + with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): + print(input_ids) + res = model.generate(input_ids, generation_config=gen_config) assert res.shape[1] == new_tokens + input_ids.shape[1] del res else: @@ -221,7 +229,8 @@ def benchmark_memory( ) if is_decoder: - _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) + with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): + _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) else: _ = model(input_ids, masks) @@ -324,7 +333,10 @@ def benchmark_memory( else: with device: model = autoclass.from_pretrained(args.model, torch_dtype=torch.float16) + +model.to_bettertransformer() torch.cuda.synchronize() + load_end = time.time_ns() act_order = None @@ -415,7 +427,8 @@ def benchmark_memory( torch.cuda.reset_peak_memory_stats() input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) - masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) + #masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) + masks=None with torch.no_grad(): max_mem = benchmark_memory( From f727313c2f19ae39a00b6ff6134f829c42b0b875 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 29 Sep 2023 08:55:35 +0000 Subject: [PATCH 11/19] Revert "Merge branch 'add_exllamav2' into update-benchmark-gptq" This reverts commit 97a7c62b0cf09ad4671a4198958977143a1191cf. --- .../usage_guides/quantization.mdx | 20 +------ optimum/bettertransformer/models/attention.py | 1 - optimum/gptq/quantizer.py | 47 ++++------------ tests/benchmark/benchmark_gptq.py | 21 ++----- tests/gptq/test_quantization.py | 56 +------------------ 5 files changed, 21 insertions(+), 124 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index ece7a69be92..2ec8d1f6683 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,26 +86,10 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) ``` -With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to -pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: - -```py -from optimum.gptq import GPTQQuantizer, load_quantized_model -import torch - -from accelerate import init_empty_weights -with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) -empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) -``` - -Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. 
- -You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) +Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. #### Fine-tune a quantized model diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 829609cdcbd..702aca3257b 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -583,7 +583,6 @@ def llama_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions is True: raise ValueError("output_attentions=True can not be supported with BetterTransformer.") diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 6ed928f10fd..70e8dfa954e 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,8 +69,7 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -108,10 +107,8 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `True`): + disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllamav2 (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -131,7 +128,6 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama - self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -141,11 +137,6 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") - if not self.disable_exllamav2 and not self.disable_exllama: - logger.warning( - "You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" - ) - self.disable_exllama = True def to_dict(self): """ @@ -214,7 +205,6 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -450,21 +440,13 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4: - # device not on gpu + if self.bits == 4 and not self.disable_exllama: if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - if not self.disable_exllama: - logger.warning( - "Found modules on cpu/disk. 
Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - if not self.disable_exllamav2: - logger.warning( - "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" - ) - self.disable_exllamav2 = True - # act order and exllama - elif self.desc_act and not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + elif self.desc_act: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " @@ -493,13 +475,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): + if self.bits == 4 and not self.disable_exllama: if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." + "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" ) class StoreAttr(object): @@ -532,7 +514,6 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -598,8 +579,7 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, ): """ @@ -635,8 +615,6 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllama (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. 
@@ -670,7 +648,6 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama - quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 57b74776ca0..45fdc262cee 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -103,11 +103,6 @@ def get_parser(): default=None, help="Revision of the model to benchmark", ) - parser.add_argument( - "--enable-flash", - action="store_true", - help="Use flash attention", - ) return parser @@ -129,8 +124,7 @@ def timing_cuda( start_event.record() if is_decoder: - with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): - _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) + _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) else: _ = model(input_ids, masks) end_event.record() @@ -163,9 +157,7 @@ def warmup( eos_token_id=None, # This is required for min_new_tokens to actually have an effect. ) model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): - print(input_ids) - res = model.generate(input_ids, generation_config=gen_config) + res = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) assert res.shape[1] == new_tokens + input_ids.shape[1] del res else: @@ -229,8 +221,7 @@ def benchmark_memory( ) if is_decoder: - with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): - _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) + _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) else: _ = model(input_ids, masks) @@ -333,10 +324,7 @@ def benchmark_memory( else: with device: model = autoclass.from_pretrained(args.model, torch_dtype=torch.float16) - -model.to_bettertransformer() torch.cuda.synchronize() - load_end = time.time_ns() act_order = None @@ -427,8 +415,7 @@ def benchmark_memory( torch.cuda.reset_peak_memory_stats() input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) - #masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) - masks=None + masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) with torch.no_grad(): max_mem = benchmark_memory( diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 946b21e90bd..53ff1a722e5 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,7 +46,6 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True - disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
@@ -70,7 +69,6 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, - disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -98,7 +96,6 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -136,18 +133,13 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False - disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -161,7 +153,6 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True - disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -187,7 +178,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False ) self.check_inference_correctness(quantized_model_from_saved) @@ -206,12 +197,7 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=False, - max_input_length=4028, - disable_exllamav2=True, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 ) prompt = "I am in Paris and" * 1000 @@ -227,42 +213,6 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) -class GPTQTestExllamav2(GPTQTest): - desc_act = False - disable_exllama = True - disable_exllamav2 = True - - def test_generate_quality(self): - # don't need to test - pass - - def test_serialization(self): - # don't need to test - pass - - def test_exllama_serialization(self): - """ - Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel - """ - from accelerate import init_empty_weights - - with tempfile.TemporaryDirectory() as tmpdirname: - self.quantizer.save(self.quantized_model, tmpdirname) - self.quantized_model.config.save_pretrained(tmpdirname) - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 - ) - empty_model.tie_weights() - quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllamav2=False, - ) - self.check_inference_correctness(quantized_model_from_saved) - - class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 1eaedeb7b6128f18951fd0f784c206de5fa24ea5 Mon Sep 17 
00:00:00 2001 From: Marc Sun Date: Fri, 29 Sep 2023 16:32:52 +0000 Subject: [PATCH 12/19] update benchmark prefill and generate --- tests/benchmark/README.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index ea6dedc1523..02a44b7ee64 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -40,6 +40,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |36.958 |27.058 |29152.98 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 32.25 | 31.01 | 11313.75 | |gptq |False |4 |128 |exllama|36.2 |33.711 |29.663 |10484.34 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |46.44 |21.53 |10344.62 | |bitsandbytes|None |None|None |None |37.64 |52.00 |19.23 |11018.36 | @@ -49,6 +50,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |37.35 |53.53 |30831.09 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 35.81 | 55.85 | 12112.42 | |gptq |False |4 |128 |exllama|36.2 |37.25 |53.68 |12162.43 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |47.41 |42.18 |12020.34 | |bitsandbytes|None |None|None |None |37.64 |74.62 |26.80 |12834.84 | @@ -58,6 +60,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel |Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |37.89 |105.55 |34187.22 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 36.04 | 110.98 | 16387.19 | |gptq |False |4 |128 |exllama |36.2 |54.14 |73.87 |15518.55 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |60.98 |65.59 |15374.67 | |bitsandbytes|None |None|None |None |37.64 |80.24 |49.85 |16187.69 | @@ -67,6 +70,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |47.37 |168.86 |40327.62 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 47.31 | 169.11 | 22463.02 | |gptq |False |4 |128 |exllama|36.2 |73.57 |108.73 |21864.56 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |104.44 |76.59 |20987.68 | |bitsandbytes|None |None|None |None |37.64 |91.29 |87.63 |22894.02 | @@ -76,6 +80,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. 
|quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |69.94 |228.76 |53986.51 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 83.09 | 192.55 | 35740.95 | |gptq |False |4 |128 |exllama|36.2 |95.41 |167.68 |34777.04 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |192.48 |83.12 |35497.62 | |bitsandbytes|None |None|None |None |37.64 |113.98 |140.38 |35532.37 | @@ -109,6 +114,7 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |96.38 |10.38 |27999.54 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 116.07 | 8.62 | 10260.35 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |112.54 |8.89 |9330.89 | |gptq |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |368.13 |2.72 |9474.19 | |bitsandbytes|None|None|None|None|512|1 |37.46|139.17 |7.19 |9952.65 | @@ -118,6 +124,7 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |169.95 |11.77 |28524.37 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 212.07 | 9.43 | 10783.60 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |190.44 |10.50 |9855.71 | |gptq |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |443.80 |4.51 |9928.23 | |bitsandbytes|None|None|None|None|512|1 |37.46|212.76 |9.40 |10421.89| @@ -127,6 +134,7 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |305.99 |13.07 |29574.01 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 385.58 | 10.37 | 11829.59 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |345.54 |11.58 |10905.35 | |gptq |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |597.24 |6.70 |10838.42 | |bitsandbytes|None|None|None|None|512|1 |37.46|349.18 |11.46|11440.08| @@ -136,17 +144,19 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |600.47 |13.32 |31673.30 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 753.06 | 10.62 | 13920.50 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |659.61 |12.13 |13004.64 | |gptq |False |4 |128 
|autogptq-cuda-old|512 |1 |43.94 |909.09 |8.80 |12862.18 | |bitsandbytes|None|None|None|None|512|1 |37.46|643.42 |12.43|13539.37| ### Batch size = 16 -|quantization |act_order|bits|group_size|kernel |num_batches|batch_size|prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| -|-----|---------|----|----------|-----------------|-----------|----------|-------------|----------|-------------|----------------------|------------------|---------------| -|None|None |None|None |None |10 |16 |512 |1 |27.22 |1209.07 |13.23 |35871.88 | -|gptq |False |4 |128 |exllama |10 |16 |512 |1 |38.35 |1280.25 |12.50 |17203.22 | -|gptq |False |4 |128 |autogptq-cuda-old|10 |16 |512 |1 |43.94 |1533.54 |10.43 |17060.76 | +|quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| +|-----|---------|----|-----------|----------|-------------|----------|-------------|----------------------|------------------|---------------| +|None|None |None|None |None |512 |1 |27.22 |1209.07 |13.23 |35871.88 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 1467.36 | 10.90 | 18104.44 | +|gptq |False |4 |128 |exllama |512 |1 |38.35 |1280.25 |12.50 |17203.22 | +|gptq |False |4 |128 |autogptq-cuda-old |512 |1 |43.94 |1533.54 |10.43 |17060.76 | |bitsandbytes|None|None|None|None|512|1 |37.46|1256.88|12.73|17737.95| ## Perplexity benchmark results From a6235567ebbc3083975bd377e35ec657f433604b Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Tue, 24 Oct 2023 20:08:59 +0200 Subject: [PATCH 13/19] replace by use_exllama_v2 --- tests/benchmark/README.md | 21 ++++++++++----------- tests/benchmark/benchmark_gptq.py | 8 ++++---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index 02a44b7ee64..e6e778c3431 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -4,7 +4,7 @@ Please refer to https://medium.com/pytorch/bettertransformer-out-of-the-box-perf # GPTQ benchmark -The results below are for AutoGPTQ 0.4.2, PyTorch 2.0.1, bitsandbytes 0.41.1, transformers 4.32. +The results below are for AutoGPTQ 0.5.0, PyTorch 2.0.1, bitsandbytes 0.41.1, transformers 4.35. 
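A quick way to sanity-check the new exllamav2 rows against the rest of each table: the reported throughput appears to follow directly from the batch size and the per-token latency. A small illustrative check (the helper below is not part of the benchmark script):

```py
# Throughput (tok/s) ~= batch_size * 1000 / per-token latency (ms).
def throughput(batch_size: int, per_token_latency_ms: float) -> float:
    return batch_size * 1000 / per_token_latency_ms

# Values taken from the exllamav2 rows above.
print(round(throughput(1, 32.25), 2))     # ~31.01 tok/s, batch size 1 decode table
print(round(throughput(16, 1467.36), 2))  # ~10.90 tok/s, batch size 16 prefill table
```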
## Generation benchmark results @@ -15,13 +15,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama-v2 --generate # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --disable-exllamav2 --task text-generation --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --disable-exllamav2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes --generate @@ -94,14 +94,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use_exllama_v2 --generate # GPTQ with exllamav kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --disable-exllamav2 --sweep --num-batches 10 --gptq --task text-generation --prefill --generate - +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --disable-exllamav2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes --generate @@ -168,13 +167,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model 
TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use_exllama_v2 --ppl # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --disable-exllamav2 --task text-generation --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --disable-exllamav2 --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --ppl # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf ---task text-generation --bitsandbytes --ppl diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 45fdc262cee..fc26e54c9bb 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -84,9 +84,9 @@ def get_parser(): help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) parser.add_argument( - "--disable-exllamav2", + "--use-exllama-v2", action="store_true", - help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", + help="Use Exllamav2 kernel. It will disable exllama kernels by default", ) parser.add_argument( "--generate", @@ -305,7 +305,7 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: quantization_config = GPTQConfig( - bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2 + bits=4, disable_exllama=args.disable_exllama, use_exllama_v2=args.use_exllama_v2 ) model = autoclass.from_pretrained( args.model, @@ -338,7 +338,7 @@ def benchmark_memory( bits = quantization_config_dict["bits"] group_size = quantization_config_dict["group_size"] - if not args.disable_exllamav2: + if args.use_exllama_v2: kernel = "exllamav2" elif not args.disable_exllama: # Exllama kernel can handle both the act-order / no act-order cases. 
From 26d87e421b115513795f00e45177b44cd6fba1a9 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 27 Oct 2023 23:10:05 +0200 Subject: [PATCH 14/19] update benchmark arg --- tests/benchmark/README.md | 16 ++++++++-------- tests/benchmark/benchmark_gptq.py | 12 +++++------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index e6e778c3431..0394b9cb3c3 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -18,10 +18,10 @@ CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-c CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama-v2 --generate # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes --generate @@ -94,13 +94,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use_exllama_v2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama-v2 --generate # GPTQ with exllamav kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes --generate @@ -167,13 +167,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision 
gptq-4bit-128g-actorder_True --gptq --task text-generation --use_exllama_v2 --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama-v2 --ppl # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama --ppl # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf ---task text-generation --bitsandbytes --ppl diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index fc26e54c9bb..9f5f6d3dcee 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -79,9 +79,9 @@ def get_parser(): help="Use the parameter ranges for (batch_size, prompt_length, new_tokens) defined in the .py file instead of the CLI ones.", ) parser.add_argument( - "--disable-exllama", + "--use-exllama", action="store_true", - help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", + help="Use Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) parser.add_argument( "--use-exllama-v2", @@ -236,7 +236,7 @@ def benchmark_memory( # I am not sure whether we should substract here `inactive_split_bytes.all.peak` (not sure what it corresponds to, though it can get quite large, in the several GB). peak_external_mb = peak_nvml_mb - peak_reserved_torch_mb - assert peak_external_mb > 0 + # assert peak_external_mb > 0 # This formula is to confirm. We measure the actual allocated PyTorch memory, plus the additional non-PyTorch memory (as the CUDA context, CUDA extension device memory). We need to substract the PyTorch peak reserved memory since this one appears in the peak nvidia-smi/nvmlDeviceGetMemoryInfo. @@ -304,9 +304,7 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: - quantization_config = GPTQConfig( - bits=4, disable_exllama=args.disable_exllama, use_exllama_v2=args.use_exllama_v2 - ) + quantization_config = GPTQConfig(bits=4, use_exllama=args.use_exllama, use_exllama_v2=args.use_exllama_v2) model = autoclass.from_pretrained( args.model, revision=args.revision, @@ -340,7 +338,7 @@ def benchmark_memory( if args.use_exllama_v2: kernel = "exllamav2" - elif not args.disable_exllama: + elif args.use_exllama: # Exllama kernel can handle both the act-order / no act-order cases. 
kernel = "exllama" elif act_order: From 4f797b114818fdf5f7c92a077100ba9733de1695 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 14:36:38 +0100 Subject: [PATCH 15/19] switch to a config_dict instead of disable_exllamav2 --- .../usage_guides/quantization.mdx | 4 +- optimum/gptq/quantizer.py | 105 +++++++++++------- optimum/utils/import_utils.py | 2 +- tests/gptq/test_quantization.py | 18 ++- 4 files changed, 76 insertions(+), 53 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 52e842e9b16..0757464f22b 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -89,7 +89,7 @@ empty_model.tie_weights() quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto") ``` -If you wish to use exllama kernels, you will have to disable exllamav2 kernels: +If you wish to use exllama kernels, you will have to change the version by setting `exllama_config`: ```py from optimum.gptq import GPTQQuantizer, load_quantized_model @@ -99,7 +99,7 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=True) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", exllama_config = {"version":1}) ``` Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable exllama/exllamav2 kernels when you are finetuning your model with peft. diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 103015642c6..ea32035da32 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -15,6 +15,7 @@ import copy import json import os +from enum import Enum from logging import getLogger from typing import Any, Dict, List, Optional, Tuple, Union @@ -49,6 +50,11 @@ logger = getLogger(__name__) +class ExllamaVersion(int, Enum): + ONE = 1 + TWO = 2 + + class GPTQQuantizer(object): r""" A simple API for GPTQ Quantization @@ -69,8 +75,8 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: Optional[bool] = None, - disable_exllamav2: bool = False, + disable_exllama: bool = False, + exllama_config: Dict[str, Any] = None, max_input_length: Optional[int] = None, cache_block_outputs: Optional[bool] = True, *args, @@ -109,10 +115,10 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`Optional[bool]`, defaults to `None`): + disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllamav2 (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. + exllama_config (`Dict[str, Any]`, *optional*): + The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. 
It is specific to the exllama backend with act-order. @@ -135,7 +141,7 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama - self.disable_exllamav2 = disable_exllamav2 + self.exllama_config = exllama_config self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ self.cache_block_outputs = cache_block_outputs @@ -146,16 +152,18 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") - if not self.disable_exllamav2 and not self.disable_exllama: - raise ValueError( - "disable_exllamav2 and disable_exllama are both set to `False`. Please disable one of the kernels." - ) - # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. - if self.disable_exllama is None: - if self.disable_exllamav2: - self.disable_exllama = False - else: - self.disable_exllama = True + + if self.exllama_config is None: + self.exllama_config = {"version": ExllamaVersion.TWO} + else: + if "version" not in self.exllama_config: + raise ValueError("`exllama_config` needs to have a `version` key") + elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: + version = self.exllama_config["version"] + raise ValueError( + f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}" + ) + self.exllama_version = self.exllama_config["version"] def to_dict(self): """ @@ -223,8 +231,8 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st desc_act=self.desc_act, group_size=self.group_size, bits=self.bits, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, + disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, ) if isinstance(module, QuantLinear): return @@ -245,10 +253,18 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st out_features = layer.weight.shape[1] if not (self.desc_act) or self.group_size == -1: new_layer = QuantLinear( - self.bits, self.group_size, in_features, out_features, True, use_cuda_fp16=self.use_cuda_fp16, weight_dtype=layer.weight.dtype + self.bits, + self.group_size, + in_features, + out_features, + True, + use_cuda_fp16=self.use_cuda_fp16, + weight_dtype=layer.weight.dtype, ) else: - new_layer = QuantLinear(self.bits, self.group_size, in_features, out_features, True, weight_dtype=layer.weight.dtype) + new_layer = QuantLinear( + self.bits, self.group_size, in_features, out_features, True, weight_dtype=layer.weight.dtype + ) new_layer.device = device setattr(module, attr, new_layer.to(device)) for name1, child in module.named_children(): @@ -483,22 +499,22 @@ def tmp(_, input, output): if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): if not self.disable_exllama: logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. 
Setting `disable_exllama=True`" ) self.disable_exllama = True # act order and exllama - elif self.desc_act and not self.disable_exllama: + elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " ) self.disable_exllama = True - elif not self.disable_exllamav2: + elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO: logger.warning( "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights." - "Setting `disable_exllamav2=True`. You should only use Exllamav2 backend for inference. " + "Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. " ) - self.disable_exllamav2 = True + self.disable_exllama = True # Step 4: Pack the model at the end (Replacing the layers) self.pack_model(model=model, quantizers=quantizers) @@ -522,13 +538,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): + if self.bits == 4 and not self.disable_exllama: if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" + "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" ) class StoreAttr(object): @@ -537,7 +553,11 @@ class StoreAttr(object): model.quantize_config = StoreAttr() model.quantize_config.desc_act = self.desc_act model = autogptq_post_init(model, use_act_order=self.desc_act) - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + if ( + self.desc_act + and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE) + and self.max_input_length is not None + ): model = exllama_set_max_input_length(model, self.max_input_length) return model @@ -560,8 +580,8 @@ def pack_model( desc_act=self.desc_act, group_size=self.group_size, bits=self.bits, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, + disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, ) logger.info("Packing model...") layers = get_layers(model) @@ -627,8 +647,8 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: Optional[bool] = None, - disable_exllamav2: bool = False, + disable_exllama: bool = False, + exllama_config: Dict[str, Any] = None, max_input_length: Optional[int] = None, ): """ @@ -664,8 +684,8 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`Optional[bool]`, defaults to `None`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllama (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. 
+ exllama_config (`Dict[str, Any]`, *optional*): + The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -686,11 +706,16 @@ def load_quantized_model( device_map = {"": torch.cuda.current_device()} logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.") - if disable_exllama is None: - if disable_exllamav2: - disable_exllama = False - else: - disable_exllama = True + if exllama_config is None: + exllama_config = {"version": ExllamaVersion.TWO} + else: + if "version" not in exllama_config: + raise ValueError("`exllama_config` needs to have a `version` key") + elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: + version = exllama_config["version"] + raise ValueError( + f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}" + ) # this branch will check if model is from huggingface try: @@ -705,7 +730,7 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama - quantizer.disable_exllamav2 = disable_exllamav2 + quantizer.exllama_config = exllama_config quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index f262548975b..d1729d85991 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -35,7 +35,7 @@ TORCH_MINIMUM_VERSION = packaging.version.parse("1.11.0") TRANSFORMERS_MINIMUM_VERSION = packaging.version.parse("4.25.0") DIFFUSERS_MINIMUM_VERSION = packaging.version.parse("0.18.0") -AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0") +AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.4.2") # This is the minimal required version to support some ONNX Runtime features diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index e5811f91a87..2ebec3f4aad 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,7 +46,7 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True - disable_exllamav2 = True + exllama_config = None cache_block_outputs = True dataset = [ @@ -71,7 +71,7 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, - disable_exllamav2=cls.disable_exllamav2, + exllama_config=cls.exllama_config, cache_block_outputs=cls.cache_block_outputs, ) @@ -99,8 +99,8 @@ def test_quantized_layers_class(self): desc_act=self.desc_act, group_size=self.group_size, bits=self.bits, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + disable_exllama=self.disable_exllama or self.exllama_config["version"] != 1, + disable_exllamav2=self.disable_exllama or self.exllama_config["version"] != 2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -142,14 +142,14 @@ def test_serialization(self): save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + exllama_config=self.exllama_config, ) 
self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False - disable_exllamav2 = True + exllama_config = {"version": 1} EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -163,7 +163,6 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True - disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -189,7 +188,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllamav2=True + empty_model, save_folder=tmpdirname, device_map={"": 0}, exllama_config={"version": 1} ) self.check_inference_correctness(quantized_model_from_saved) @@ -212,7 +211,7 @@ def test_exllama_max_input_length(self): save_folder=tmpdirname, device_map={"": 0}, max_input_length=4028, - disable_exllamav2=True, + exllama_config={"version": 1}, ) prompt = "I am in Paris and" * 1000 @@ -231,7 +230,6 @@ def test_exllama_max_input_length(self): class GPTQTestExllamav2(GPTQTest): desc_act = False disable_exllama = True - disable_exllamav2 = True def test_generate_quality(self): # don't need to test From 1d845c79e73aeec71a6bbdbd43e43bad6ee4fc2b Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:23:53 -0400 Subject: [PATCH 16/19] Apply suggestions from code review Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> --- optimum/gptq/quantizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index ea32035da32..56eca366051 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -648,7 +648,7 @@ def load_quantized_model( offload_buffers: Optional[str] = None, offload_state_dict: bool = False, disable_exllama: bool = False, - exllama_config: Dict[str, Any] = None, + exllama_config: Optional[Dict[str, Any]] = None, max_input_length: Optional[int] = None, ): """ @@ -684,7 +684,7 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`Optional[bool]`, defaults to `None`): Whether to use exllama backend. Only works with `bits` = 4. - exllama_config (`Dict[str, Any]`, *optional*): + exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`): The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. 
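Between these docstring tweaks and the quantizer changes in the previous patch, the key behavior is how `exllama_config["version"]` is folded into the two boolean flags (`disable_exllama`, `disable_exllamav2`) handed to auto-gptq when picking the `QuantLinear` class. A small self-contained sketch of that mapping (the helper name is illustrative; the expressions mirror those used in `_replace_by_quant_layers` and `pack_model`):

```py
from enum import Enum


class ExllamaVersion(int, Enum):
    ONE = 1
    TWO = 2


def quantlinear_kernel_flags(disable_exllama: bool, exllama_version: int) -> dict:
    # Mirrors how the quantizer turns (disable_exllama, exllama_config["version"])
    # into the disable_exllama / disable_exllamav2 flags used for QuantLinear selection.
    return {
        "disable_exllama": disable_exllama or exllama_version != ExllamaVersion.ONE,
        "disable_exllamav2": disable_exllama or exllama_version != ExllamaVersion.TWO,
    }


print(quantlinear_kernel_flags(False, ExllamaVersion.TWO))  # exllamav2 kernel active (the default)
print(quantlinear_kernel_flags(False, ExllamaVersion.ONE))  # fall back to the exllama (v1) kernel
print(quantlinear_kernel_flags(True, ExllamaVersion.TWO))   # both disabled -> cuda / cuda-old kernels
```

On the user side this reduces to `load_quantized_model(..., exllama_config={"version": 1})` to pick the v1 kernel, with `{"version": 2}` assumed when the argument is left unset.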
From d5b298fdd04cde020203ee28faf6e7fa12a7fd01 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 16:24:43 +0100 Subject: [PATCH 17/19] better tests --- optimum/gptq/quantizer.py | 1 + tests/gptq/test_quantization.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index ea32035da32..e209c3d5bd8 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -731,6 +731,7 @@ def load_quantized_model( quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama quantizer.exllama_config = exllama_config + quantizer.exllama_version = quantizer.exllama_config["version"] quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 2ebec3f4aad..de2acddf6f2 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -104,6 +104,9 @@ def test_quantized_layers_class(self): ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) + def check_quantized_layers_type(self, model, value): + self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value) + def check_inference_correctness(self, model): """ Test the generation quality of the quantized model and see that we are matching the expected output. @@ -118,6 +121,7 @@ def check_inference_correctness(self, model): # Check the exactness of the result self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + def test_generate_quality(self): self.check_inference_correctness(self.quantized_model) @@ -144,6 +148,7 @@ def test_serialization(self): disable_exllama=self.disable_exllama, exllama_config=self.exllama_config, ) + self.check_quantized_layers_type(quantized_model_from_saved,"cuda-old") self.check_inference_correctness(quantized_model_from_saved) @@ -190,6 +195,7 @@ def test_exllama_serialization(self): quantized_model_from_saved = load_quantized_model( empty_model, save_folder=tmpdirname, device_map={"": 0}, exllama_config={"version": 1} ) + self.check_quantized_layers_type(quantized_model_from_saved,"exllama") self.check_inference_correctness(quantized_model_from_saved) def test_exllama_max_input_length(self): @@ -213,6 +219,7 @@ def test_exllama_max_input_length(self): max_input_length=4028, exllama_config={"version": 1}, ) + self.check_quantized_layers_type(quantized_model_from_saved,"exllama") prompt = "I am in Paris and" * 1000 inp = self.tokenizer(prompt, return_tensors="pt").to(0) @@ -258,6 +265,7 @@ def test_exllama_serialization(self): save_folder=tmpdirname, device_map={"": 0}, ) + self.check_quantized_layers_type(quantized_model_from_saved,"exllamav2") self.check_inference_correctness(quantized_model_from_saved) From c21601d8ec1000968775acc7c50c130dce0c5636 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 16:26:30 +0100 Subject: [PATCH 18/19] style --- tests/gptq/test_quantization.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index de2acddf6f2..7f50a57496a 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -121,7 +121,6 @@ def check_inference_correctness(self, model): # Check the exactness of the result self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) - def 
test_generate_quality(self): self.check_inference_correctness(self.quantized_model) @@ -148,7 +147,7 @@ def test_serialization(self): disable_exllama=self.disable_exllama, exllama_config=self.exllama_config, ) - self.check_quantized_layers_type(quantized_model_from_saved,"cuda-old") + self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old") self.check_inference_correctness(quantized_model_from_saved) @@ -195,7 +194,7 @@ def test_exllama_serialization(self): quantized_model_from_saved = load_quantized_model( empty_model, save_folder=tmpdirname, device_map={"": 0}, exllama_config={"version": 1} ) - self.check_quantized_layers_type(quantized_model_from_saved,"exllama") + self.check_quantized_layers_type(quantized_model_from_saved, "exllama") self.check_inference_correctness(quantized_model_from_saved) def test_exllama_max_input_length(self): @@ -219,7 +218,7 @@ def test_exllama_max_input_length(self): max_input_length=4028, exllama_config={"version": 1}, ) - self.check_quantized_layers_type(quantized_model_from_saved,"exllama") + self.check_quantized_layers_type(quantized_model_from_saved, "exllama") prompt = "I am in Paris and" * 1000 inp = self.tokenizer(prompt, return_tensors="pt").to(0) @@ -265,7 +264,7 @@ def test_exllama_serialization(self): save_folder=tmpdirname, device_map={"": 0}, ) - self.check_quantized_layers_type(quantized_model_from_saved,"exllamav2") + self.check_quantized_layers_type(quantized_model_from_saved, "exllamav2") self.check_inference_correctness(quantized_model_from_saved) From 11e71e260c7c8f7e6773da1f473d90b9836e9f89 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 17:46:11 +0100 Subject: [PATCH 19/19] style --- tests/benchmark/README.md | 6 +++--- tests/benchmark/benchmark_gptq.py | 23 ++++++++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index 0394b9cb3c3..b6d6f9cddad 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -15,7 +15,7 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama-v2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama --exllama-version 2 --generate # GPTQ with exllama kernel (int4/fp16) CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama --generate @@ -94,7 +94,7 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama-v2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama --exllama-version 2 --generate # GPTQ with exllamav kernel (int4/fp16) CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task 
text-generation --prefill --use-exllama --generate @@ -167,7 +167,7 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama-v2 --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama --exllama-version 2 --ppl # GPTQ with exllama kernel (int4/fp16) CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama --ppl diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 9f5f6d3dcee..29f986015a4 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -84,9 +84,10 @@ def get_parser(): help="Use Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) parser.add_argument( - "--use-exllama-v2", - action="store_true", - help="Use Exllamav2 kernel. It will disable exllama kernels by default", + "--exllama-version", + type=int, + default=2, + help="Use Exllamav2 kernel. Set 1 in order to use exllama kernel", ) parser.add_argument( "--generate", @@ -304,7 +305,9 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: - quantization_config = GPTQConfig(bits=4, use_exllama=args.use_exllama, use_exllama_v2=args.use_exllama_v2) + quantization_config = GPTQConfig( + bits=4, use_exllama=args.use_exllama, exllama_config={"version": args.exllama_version} + ) model = autoclass.from_pretrained( args.model, revision=args.revision, @@ -335,12 +338,14 @@ def benchmark_memory( act_order = quantization_config_dict["desc_act"] bits = quantization_config_dict["bits"] group_size = quantization_config_dict["group_size"] + use_exllama = quantization_config_dict["use_exllama"] + exllama_version = quantization_config_dict["exllama_config"]["version"] - if args.use_exllama_v2: - kernel = "exllamav2" - elif args.use_exllama: - # Exllama kernel can handle both the act-order / no act-order cases. - kernel = "exllama" + if use_exllama: + if exllama_version == 2: + kernel = "exllamav2" + else: + kernel = "exllama" elif act_order: kernel = "autotogptq-cuda" else:
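
To round off the benchmark changes in [PATCH 19/19]: the old `--use-exllama-v2` switch is replaced by `--use-exllama` combined with an integer `--exllama-version` (default 2), and the pair is forwarded to `GPTQConfig` as an `exllama_config` dict. The sketch below approximates the resulting loading call; the checkpoint id is taken from the README commands above, while the `torch_dtype` and `device_map` arguments are assumptions, since the `from_pretrained` call is only partially visible in the hunk.

```py
# Sketch only (not a verbatim copy of benchmark_gptq.py): how --use-exllama and
# --exllama-version map onto the GPTQConfig used to load a pre-quantized checkpoint.
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

use_exllama = True
exllama_version = 2  # 1 -> exllama kernel, 2 -> exllamav2 kernel (the script's default)

quantization_config = GPTQConfig(
    bits=4,
    use_exllama=use_exllama,
    exllama_config={"version": exllama_version},
)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-13B-chat-GPTQ",  # example checkpoint from the README commands
    quantization_config=quantization_config,
    torch_dtype=torch.float16,         # assumed, matching the fp16 setup elsewhere
    device_map="auto",                 # assumed
)
```

With this mapping, the kernel label reported by the script resolves to `"exllamav2"` when the version is 2 and `"exllama"` otherwise, as in the branch at the end of the hunk above.
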