From 5db25419bf89819ea6a84528e84d737c4ffdae49 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 11:19:13 +0000 Subject: [PATCH 01/19] add_exllamav2 --- .../usage_guides/quantization.mdx | 15 ++++++- optimum/gptq/quantizer.py | 45 ++++++++++++++----- tests/gptq/test_quantization.py | 45 +++++++++++++++++-- 3 files changed, 89 insertions(+), 16 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 2ec8d1f6683..5b669841829 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -89,8 +89,21 @@ empty_model.tie_weights() quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) ``` -Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. +With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to +pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: +```py +from optimum.gptq import GPTQQuantizer, load_quantized_model +import torch + +from accelerate import init_empty_weights +with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) +empty_model.tie_weights() +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) +``` + +Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. #### Fine-tune a quantized model With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been quantized with GPTQ. diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 70e8dfa954e..ae975c9b8e0 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,7 +69,8 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -107,8 +108,10 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `False`): + disable_exllama (`bool`, defaults to `True`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllamav2 (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. 
@@ -128,6 +131,7 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama + self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -137,6 +141,9 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") + if not self.disable_exllamav2 and not self.disable_exllama: + logger.warning("You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`") + self.disable_exllama=True def to_dict(self): """ @@ -205,6 +212,7 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -440,13 +448,21 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4: + # device not on gpu if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - elif self.desc_act: + if not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + if not self.disable_exllamav2: + logger.warning( + "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" + ) + self.disable_exllamav2 = True + # act order and exllama + elif self.desc_act and not self.disable_exllama: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " @@ -475,13 +491,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." 
+ "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" ) class StoreAttr(object): @@ -514,6 +530,7 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -579,7 +596,8 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, ): """ @@ -615,6 +633,8 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllama (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -648,6 +668,7 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama + quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 53ff1a722e5..ad3e36be73a 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,6 +46,7 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True + disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
@@ -69,6 +70,7 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, + disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -96,6 +98,7 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -133,13 +136,14 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama, disable_exllamav2=self.disable_exllamav2 ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False + disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -153,6 +157,7 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True + disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -178,7 +183,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True ) self.check_inference_correctness(quantized_model_from_saved) @@ -197,7 +202,7 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028, disable_exllamav2=True ) prompt = "I am in Paris and" * 1000 @@ -213,6 +218,40 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) + +class GPTQTestExllamav2(GPTQTest): + desc_act = False + disable_exllama = True + disable_exllamav2 = True + + def test_generate_quality(self): + # don't need to test + pass + + def test_serialization(self): + # don't need to test + pass + + def test_exllama_serialization(self): + """ + Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel + """ + from accelerate import init_empty_weights + + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantizer.save(self.quantized_model, tmpdirname) + self.quantized_model.config.save_pretrained(tmpdirname) + with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_config( + AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 + ) + empty_model.tie_weights() + quantized_model_from_saved = load_quantized_model( + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllamav2=False, + ) + self.check_inference_correctness(quantized_model_from_saved) + + class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 03441b8acab14e03ed322154dfd0984cb4b1699c Mon Sep 17 00:00:00 2001 From: Marc 
Sun Date: Wed, 27 Sep 2023 11:50:45 +0000 Subject: [PATCH 02/19] style --- optimum/gptq/quantizer.py | 8 +++++--- tests/gptq/test_quantization.py | 23 +++++++++++++++++------ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index ae975c9b8e0..6ed928f10fd 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -70,7 +70,7 @@ def __init__( batch_size: int = 1, pad_token_id: Optional[int] = None, disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -142,8 +142,10 @@ def __init__( if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") if not self.disable_exllamav2 and not self.disable_exllama: - logger.warning("You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`") - self.disable_exllama=True + logger.warning( + "You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" + ) + self.disable_exllama = True def to_dict(self): """ diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index ad3e36be73a..946b21e90bd 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -136,7 +136,11 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama, disable_exllamav2=self.disable_exllamav2 + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.check_inference_correctness(quantized_model_from_saved) @@ -202,7 +206,12 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028, disable_exllamav2=True + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=False, + max_input_length=4028, + disable_exllamav2=True, ) prompt = "I am in Paris and" * 1000 @@ -218,12 +227,11 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) - class GPTQTestExllamav2(GPTQTest): desc_act = False disable_exllama = True disable_exllamav2 = True - + def test_generate_quality(self): # don't need to test pass @@ -247,11 +255,14 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllamav2=False, + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllamav2=False, ) self.check_inference_correctness(quantized_model_from_saved) - + class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 80d085e239540f103c8695618df6811ab592a40f Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 14:21:06 +0000 Subject: [PATCH 03/19] fix doc --- docs/source/llm_quantization/usage_guides/quantization.mdx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 5b669841829..ece7a69be92 100644 --- 
a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,7 +86,7 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) ``` With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to @@ -104,6 +104,9 @@ quantized_model = load_quantized_model(empty_model, save_folder=save_folder, dev ``` Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. + +You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) + #### Fine-tune a quantized model With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been quantized with GPTQ. From 0c53c2f4a9334e32b1606997281528faa31c0f63 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:03:14 +0000 Subject: [PATCH 04/19] simplify script --- tests/benchmark/benchmark_gptq.py | 258 ++++++++++++++++-------------- 1 file changed, 139 insertions(+), 119 deletions(-) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 06af05056a3..369e6922e8e 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -1,12 +1,10 @@ import argparse import gc -import json import os import time import numpy as np import torch -from accelerate import init_empty_weights from memory_tracker import MemoryTracker from tqdm import tqdm from transformers import ( @@ -16,11 +14,11 @@ AutoTokenizer, BitsAndBytesConfig, GenerationConfig, + GPTQConfig ) from optimum.exporters import TasksManager -from optimum.gptq import load_quantized_model - +from auto_gptq.utils import Perplexity def get_parser(): parser = argparse.ArgumentParser() @@ -45,13 +43,7 @@ def get_parser(): parser.add_argument( "--model", type=str, - help="Model to benchmark (in the non-quantized case), or reference architecture corresponding to the quantized model (GPTQ case)", - ) - parser.add_argument( - "--gptq-model", - type=str, - default=None, - help="Path to a local GPTQ model.", + help="Model to benchmark", ) parser.add_argument( "--prompt-length", @@ -90,6 +82,27 @@ def get_parser(): action="store_true", help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) + parser.add_argument( + "--disable-exllamav2", + action="store_true", + help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", + ) + parser.add_argument( + "--generate", + action="store_true", + help="Calculate the generate speed (prompt processing + token generation)", + ) + parser.add_argument( + "--ppl", + action="store_true", + help="Calculate the perplexity on wikitext2 dataset", + ) + parser.add_argument( + "--revision", + default=None, + help="Revision of the model to benchmark", + ) + return parser @@ -266,7 +279,7 @@ def benchmark_memory( device = 
torch.device("cuda:0") memory_tracker = MemoryTracker() -tokenizer = AutoTokenizer.from_pretrained(args.model) +tokenizer = AutoTokenizer.from_pretrained(args.model,revision=args.revision, use_fast=False) if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -288,46 +301,14 @@ def benchmark_memory( else: is_decoder = False -act_order = None -bits = None -group_size = None -kernel = None -if args.gptq: - if not args.gptq_model: - raise ValueError("The argument --gptq-model needs to be provided when benchmarking GPTQ.") - - with open(os.path.join(args.gptq_model, "quantization_config.json"), "r", encoding="utf-8") as f: - quantize_config_dict = json.load(f) - - act_order = quantize_config_dict["desc_act"] - bits = quantize_config_dict["bits"] - group_size = quantize_config_dict["group_size"] - - if not args.disable_exllama: - # Exllama kernel can handle both the act-order / no act-order cases. - kernel = "exllama" - elif act_order: - kernel = "autotogptq-cuda" - else: - kernel = "autogptq-cuda-old" - load_start = time.time_ns() if args.gptq: - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16) - empty_model.tie_weights() - model = load_quantized_model( - empty_model, - save_folder=args.gptq_model, - state_dict_name="model.safetensors", - device_map="auto", - disable_exllama=args.disable_exllama, - ) + quantization_config = GPTQConfig(bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2) + model = autoclass.from_pretrained(args.model,revision=args.revision, quantization_config=quantization_config, torch_dtype=torch.float16, device_map="auto") elif args.bitsandbytes: quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="fp4", bnb_4bit_compute_dtype=torch.float16 ) - model = autoclass.from_pretrained( args.model, quantization_config=quantization_config, device_map="auto", torch_dtype=torch.float16 ) @@ -337,6 +318,27 @@ def benchmark_memory( torch.cuda.synchronize() load_end = time.time_ns() +act_order = None +bits = None +group_size = None +kernel = None + +if args.gptq: + quantization_config_dict = model.config.quantization_config.to_dict() + act_order = quantization_config_dict["desc_act"] + bits = quantization_config_dict["bits"] + group_size = quantization_config_dict["group_size"] + + if not args.disable_exllamav2: + kernel = "exllamav2" + elif not args.disable_exllama: + # Exllama kernel can handle both the act-order / no act-order cases. + kernel = "exllama" + elif act_order: + kernel = "autotogptq-cuda" + else: + kernel = "autogptq-cuda-old" + load_time = (load_end - load_start) * 1e-9 print(f"Model load time: {load_time:.1f} s") @@ -364,82 +366,100 @@ def benchmark_memory( file_name = file_name + "_noquant" quantization = None -file_name = file_name + ".csv" -output_file = open(file_name, "w") -header = "quantization, act_order, bits, group_size, kernel, num_batches, batch_size, prompt_length, new_tokens, Load time (s), Per-token latency (ms), Throughput (tok/s), Max memory (MB)\n" -output_file.write(header) - -latencies = {} -throughputs = {} -all_max_mem = {} -print( - "WARNING: The reported peak memory is only a rough estimate, and can NOT be precisely relied upon to estimate an OOM limit." 
-) - -for batch_size in tqdm(batch_sizes): - for prompt_length in tqdm(prompt_lengths): - for new_token in tqdm(new_tokens): - print(f"---- Running: batch_size={batch_size}, prompt_length={prompt_length}, new_tokens={new_token}") - - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) - masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) +if args.ppl: + output_file = open(file_name + "_perplexity.csv", "w") + header = "quantization, act_order, bits, group_size, kernel, perplexity\n" + output_file.write(header) + ppl = Perplexity(model, tokenizer) + ppl_value = np.mean(ppl.calculate_perplexity()) + line = "{},{},{},{},{},{}\n".format( + quantization, + act_order, + bits, + group_size, + kernel, + f"{ppl_value:.2f}", + ) + print(header) + print(line) + output_file.write(line) + output_file.close() + +if args.generate: + output_file = open(file_name + ".csv", "w") + header = "quantization, act_order, bits, group_size, kernel, num_batches, batch_size, prompt_length, new_tokens, Load time (s), Per-token latency (ms), Throughput (tok/s), Max memory (MB)\n" + output_file.write(header) + + latencies = {} + throughputs = {} + all_max_mem = {} + print( + "WARNING: The reported peak memory is only a rough estimate, and can NOT be precisely relied upon to estimate an OOM limit." + ) - with torch.no_grad(): - max_mem = benchmark_memory( - model, - input_ids, - masks, - args.num_batches, - is_decoder, - new_token, - tokenizer.pad_token_id, - memory_tracker=memory_tracker, + for batch_size in tqdm(batch_sizes): + for prompt_length in tqdm(prompt_lengths): + for new_token in tqdm(new_tokens): + print(f"---- Running: batch_size={batch_size}, prompt_length={prompt_length}, new_tokens={new_token}") + + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) + masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) + + with torch.no_grad(): + max_mem = benchmark_memory( + model, + input_ids, + masks, + args.num_batches, + is_decoder, + new_token, + tokenizer.pad_token_id, + memory_tracker=memory_tracker, + ) + + mean_latency = benchmark_latency( + model, + input_ids, + masks, + args.num_batches, + is_decoder, + new_token, + tokenizer.pad_token_id, + memory_tracker=memory_tracker, + ) + + index = (batch_size, prompt_length, new_token) + + per_token_latency = mean_latency / new_token + latencies[index] = per_token_latency + + throughput = batch_size / (per_token_latency * 1e-3) + throughputs[index] = throughput + all_max_mem[index] = max_mem + + print( + f"Latency per token: {per_token_latency:.3f} ms, throughput: {throughput:.3f} tok/s, peak mem: {max_mem:.2f} MB" ) - mean_latency = benchmark_latency( - model, - input_ids, - masks, + line = "{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format( + quantization, + act_order, + bits, + group_size, + kernel, args.num_batches, - is_decoder, + batch_size, + prompt_length, new_token, - tokenizer.pad_token_id, - memory_tracker=memory_tracker, + f"{load_time:.2f}", + f"{per_token_latency:.2f}", + f"{throughput:.2f}", + f"{max_mem:.2f}", ) - - index = (batch_size, prompt_length, new_token) - - per_token_latency = mean_latency / new_token - latencies[index] = per_token_latency - - throughput = batch_size / (per_token_latency * 1e-3) - throughputs[index] = throughput - all_max_mem[index] = max_mem - - 
print( - f"Latency per token: {per_token_latency:.3f} ms, throughput: {throughput:.3f} tok/s, peak mem: {max_mem:.2f} MB" - ) - - line = "{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format( - quantization, - act_order, - bits, - group_size, - kernel, - args.num_batches, - batch_size, - prompt_length, - new_token, - f"{load_time:.2f}", - f"{per_token_latency:.2f}", - f"{throughput:.2f}", - f"{max_mem:.2f}", - ) - print(header) - print(line) - output_file.write(line) - -output_file.close() + print(header) + print(line) + output_file.write(line) + output_file.close() From 216213e46e094de9d72614c09b058dceb1b35020 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:18:06 +0000 Subject: [PATCH 05/19] style --- tests/benchmark/benchmark_gptq.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 369e6922e8e..45fdc262cee 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -5,6 +5,7 @@ import numpy as np import torch +from auto_gptq.utils import Perplexity from memory_tracker import MemoryTracker from tqdm import tqdm from transformers import ( @@ -14,11 +15,11 @@ AutoTokenizer, BitsAndBytesConfig, GenerationConfig, - GPTQConfig + GPTQConfig, ) from optimum.exporters import TasksManager -from auto_gptq.utils import Perplexity + def get_parser(): parser = argparse.ArgumentParser() @@ -102,7 +103,7 @@ def get_parser(): default=None, help="Revision of the model to benchmark", ) - + return parser @@ -279,7 +280,7 @@ def benchmark_memory( device = torch.device("cuda:0") memory_tracker = MemoryTracker() -tokenizer = AutoTokenizer.from_pretrained(args.model,revision=args.revision, use_fast=False) +tokenizer = AutoTokenizer.from_pretrained(args.model, revision=args.revision, use_fast=False) if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -303,8 +304,16 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: - quantization_config = GPTQConfig(bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2) - model = autoclass.from_pretrained(args.model,revision=args.revision, quantization_config=quantization_config, torch_dtype=torch.float16, device_map="auto") + quantization_config = GPTQConfig( + bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2 + ) + model = autoclass.from_pretrained( + args.model, + revision=args.revision, + quantization_config=quantization_config, + torch_dtype=torch.float16, + device_map="auto", + ) elif args.bitsandbytes: quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="fp4", bnb_4bit_compute_dtype=torch.float16 @@ -338,7 +347,7 @@ def benchmark_memory( kernel = "autotogptq-cuda" else: kernel = "autogptq-cuda-old" - + load_time = (load_end - load_start) * 1e-9 print(f"Model load time: {load_time:.1f} s") From dadc6dc9e3f4001dfd55c68fef0acc028ffdbe79 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:54:58 +0000 Subject: [PATCH 06/19] update perplexity measure --- tests/benchmark/README.md | 59 +++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index e1fb1f01dd5..ea6dedc1523 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -11,22 +11,20 @@ The results below are for AutoGPTQ 0.4.2, PyTorch 2.0.1, bitsandbytes 
0.41.1, tr Run ```shell -git clone --branch main https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ -cd Llama-2-13B-chat-GPTQ -mv gptq_model-4bit-128g.safetensors model.safetensors -mv quantize_config.json quantization_config.json - # pytorch fp16 -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --generate + +# GPTQ with exllamav2 kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model /path/to/Llama-2-13B-chat-GPTQ/ --sweep --num-batches 4 --gptq --task text-generation +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --disable-exllamav2 --task text-generation --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model /path/to/Llama-2-13B-chat-GPTQ/ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --disable-exllamav2 --generate # using bitsandbytes fp4/fp16 scheme -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes --generate ``` Here are results obtained on a single NVIDIA A100-SXM4-80GB GPU. We use a prompt length of 512, and generate exactly 512 new tokens. Each generation is repeated for 4 batches, and metrics are averaged over the number of batches and generation length. 
@@ -88,16 +86,20 @@ Run ```shell # pytorch fp16 -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate + +# GPTQ with exllamav2 kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate + +# GPTQ with exllamav kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --disable-exllamav2 --sweep --num-batches 10 --gptq --task text-generation --prefill --generate -# GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --gptq-model ../../../Llama-2-13B-chat-GPTQ/ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --disable-exllamav2 --generate # using bitsandbytes fp4/fp16 scheme -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model daryl149/llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes --generate ``` The benchmark below is for a prompt length of 512, measuring only the prefill step on a single NVIDIA A100-SXM4-80GB GPU. The forward is repeated 10 times. This benchmark typically corresponds to the forward during training (to the difference that here `generate` is called, which has some overhead). 
@@ -146,3 +148,32 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |gptq |False |4 |128 |exllama |10 |16 |512 |1 |38.35 |1280.25 |12.50 |17203.22 | |gptq |False |4 |128 |autogptq-cuda-old|10 |16 |512 |1 |43.94 |1533.54 |10.43 |17060.76 | |bitsandbytes|None|None|None|None|512|1 |37.46|1256.88|12.73|17737.95| + +## Perplexity benchmark results + +Run + +```shell +# pytorch fp16 +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl + +# GPTQ with exllamav2 kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl + +# GPTQ with exllama kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --disable-exllamav2 --task text-generation --ppl + +# GPTQ without exllama kernel (int4/fp16) +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --disable-exllamav2 --ppl + +# using bitsandbytes fp4/fp16 scheme +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf ---task text-generation --bitsandbytes --ppl +``` + +| quantization | act_order | bits | group_size | kernel | perplexity | +|--------------|-----------|------|------------|------------------|------------| +| None | None | None | None | None | 6.61 | +| gptq | True | 4 | 128 | exllamav2 | 6.77 | +| gptq | True | 4 | 128 | exllama | 6.77 | +| gptq | True | 4 | 128 | autogptq-cuda-old| 6.77 | +| bitsandbytes | None | 4 | None | None | 6.78 | \ No newline at end of file From cf4019d685e133f6c6b36d55176f85494336c487 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 17:01:37 +0000 Subject: [PATCH 07/19] Revert "Merge branch 'add_exllamav2' into update-benchmark-gptq" This reverts commit f2dbdc2ea13183c353dfa22135d2a7f401a3dbbb, reversing changes made to 216213e46e094de9d72614c09b058dceb1b35020. --- .../usage_guides/quantization.mdx | 20 +------ optimum/gptq/quantizer.py | 47 ++++------------ tests/gptq/test_quantization.py | 56 +------------------ 3 files changed, 17 insertions(+), 106 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index ece7a69be92..2ec8d1f6683 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,26 +86,10 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) ``` -With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. 
You just need to -pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: - -```py -from optimum.gptq import GPTQQuantizer, load_quantized_model -import torch - -from accelerate import init_empty_weights -with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) -empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) -``` - -Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. - -You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) +Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. #### Fine-tune a quantized model diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 6ed928f10fd..70e8dfa954e 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,8 +69,7 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -108,10 +107,8 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `True`): + disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllamav2 (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -131,7 +128,6 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama - self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -141,11 +137,6 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") - if not self.disable_exllamav2 and not self.disable_exllama: - logger.warning( - "You have activated exllama and exllamav2 backend. 
Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" - ) - self.disable_exllama = True def to_dict(self): """ @@ -214,7 +205,6 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -450,21 +440,13 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4: - # device not on gpu + if self.bits == 4 and not self.disable_exllama: if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - if not self.disable_exllama: - logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - if not self.disable_exllamav2: - logger.warning( - "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" - ) - self.disable_exllamav2 = True - # act order and exllama - elif self.desc_act and not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + elif self.desc_act: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " @@ -493,13 +475,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): + if self.bits == 4 and not self.disable_exllama: if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." + "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" ) class StoreAttr(object): @@ -532,7 +514,6 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -598,8 +579,7 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, ): """ @@ -635,8 +615,6 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllama (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. 
It is specific to the exllama backend with act-order. @@ -670,7 +648,6 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama - quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 946b21e90bd..53ff1a722e5 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,7 +46,6 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True - disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." @@ -70,7 +69,6 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, - disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -98,7 +96,6 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -136,18 +133,13 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False - disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -161,7 +153,6 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True - disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -187,7 +178,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False ) self.check_inference_correctness(quantized_model_from_saved) @@ -206,12 +197,7 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=False, - max_input_length=4028, - disable_exllamav2=True, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 ) prompt = "I am in Paris and" * 1000 @@ -227,42 +213,6 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) -class GPTQTestExllamav2(GPTQTest): - desc_act = False - disable_exllama = True - disable_exllamav2 = True - - def test_generate_quality(self): - # don't need to test - pass - - def test_serialization(self): - # don't need to test - pass - - def test_exllama_serialization(self): - """ - Test the serialization of the 
model and the loading of the quantized weights with exllamav2 kernel - """ - from accelerate import init_empty_weights - - with tempfile.TemporaryDirectory() as tmpdirname: - self.quantizer.save(self.quantized_model, tmpdirname) - self.quantized_model.config.save_pretrained(tmpdirname) - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 - ) - empty_model.tie_weights() - quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllamav2=False, - ) - self.check_inference_correctness(quantized_model_from_saved) - - class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 97a7c62b0cf09ad4671a4198958977143a1191cf Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 27 Sep 2023 16:20:52 +0000 Subject: [PATCH 08/19] Merge branch 'add_exllamav2' into update-benchmark-gptq --- .../usage_guides/quantization.mdx | 20 ++++++- optimum/gptq/quantizer.py | 47 ++++++++++++---- tests/gptq/test_quantization.py | 56 ++++++++++++++++++- 3 files changed, 106 insertions(+), 17 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 2ec8d1f6683..ece7a69be92 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,10 +86,26 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) ``` -Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. +With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to +pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: + +```py +from optimum.gptq import GPTQQuantizer, load_quantized_model +import torch + +from accelerate import init_empty_weights +with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) +empty_model.tie_weights() +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) +``` + +Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. 
+ +You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) #### Fine-tune a quantized model diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 70e8dfa954e..6ed928f10fd 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,7 +69,8 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -107,8 +108,10 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `False`): + disable_exllama (`bool`, defaults to `True`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllamav2 (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -128,6 +131,7 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama + self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -137,6 +141,11 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") + if not self.disable_exllamav2 and not self.disable_exllama: + logger.warning( + "You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" + ) + self.disable_exllama = True def to_dict(self): """ @@ -205,6 +214,7 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -440,13 +450,21 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4: + # device not on gpu if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - elif self.desc_act: + if not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + if not self.disable_exllamav2: + logger.warning( + "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" + ) + self.disable_exllamav2 = True + # act order and exllama + elif self.desc_act and not self.disable_exllama: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. 
You should only use Exllama backend with act_order for inference. " @@ -475,13 +493,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and not self.disable_exllama: + if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." + "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" ) class StoreAttr(object): @@ -514,6 +532,7 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -579,7 +598,8 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = False, + disable_exllama: bool = True, + disable_exllamav2: bool = False, max_input_length: Optional[int] = None, ): """ @@ -615,6 +635,8 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. + disable_exllama (`bool`, defaults to `False`): + Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -648,6 +670,7 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama + quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 53ff1a722e5..946b21e90bd 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,6 +46,7 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True + disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
@@ -69,6 +70,7 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, + disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -96,6 +98,7 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -133,13 +136,18 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=self.disable_exllama, + disable_exllamav2=self.disable_exllamav2, ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False + disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -153,6 +161,7 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True + disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -178,7 +187,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True ) self.check_inference_correctness(quantized_model_from_saved) @@ -197,7 +206,12 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllama=False, + max_input_length=4028, + disable_exllamav2=True, ) prompt = "I am in Paris and" * 1000 @@ -213,6 +227,42 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) +class GPTQTestExllamav2(GPTQTest): + desc_act = False + disable_exllama = True + disable_exllamav2 = True + + def test_generate_quality(self): + # don't need to test + pass + + def test_serialization(self): + # don't need to test + pass + + def test_exllama_serialization(self): + """ + Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel + """ + from accelerate import init_empty_weights + + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantizer.save(self.quantized_model, tmpdirname) + self.quantized_model.config.save_pretrained(tmpdirname) + with init_empty_weights(): + empty_model = AutoModelForCausalLM.from_config( + AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 + ) + empty_model.tie_weights() + quantized_model_from_saved = load_quantized_model( + empty_model, + save_folder=tmpdirname, + device_map={"": 0}, + disable_exllamav2=False, + ) + self.check_inference_correctness(quantized_model_from_saved) + + class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 62b89d954de63e88265acdb07567ff1f68aeafad Mon Sep 17 
00:00:00 2001 From: Marc Sun Date: Thu, 28 Sep 2023 15:11:17 +0000 Subject: [PATCH 09/19] fix arg in llama attention --- optimum/bettertransformer/models/attention.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 702aca3257b..829609cdcbd 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -583,6 +583,7 @@ def llama_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions is True: raise ValueError("output_attentions=True can not be supported with BetterTransformer.") From 1ef6ce523d2dfa07f3e2e2a1daad2f27e259e6cb Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 29 Sep 2023 08:51:35 +0000 Subject: [PATCH 10/19] flash_attention arg --- tests/benchmark/benchmark_gptq.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 45fdc262cee..57b74776ca0 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -103,6 +103,11 @@ def get_parser(): default=None, help="Revision of the model to benchmark", ) + parser.add_argument( + "--enable-flash", + action="store_true", + help="Use flash attention", + ) return parser @@ -124,7 +129,8 @@ def timing_cuda( start_event.record() if is_decoder: - _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) + with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): + _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) else: _ = model(input_ids, masks) end_event.record() @@ -157,7 +163,9 @@ def warmup( eos_token_id=None, # This is required for min_new_tokens to actually have an effect. ) model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. 
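To make the `--enable-flash` flag above concrete: the benchmark wraps `generate` in PyTorch's SDPA kernel-selection context manager so that only one attention backend is active while timing. A minimal standalone sketch of that pattern (illustrative only; the tensor shapes and dtype below are assumptions, not values taken from the benchmark script):

```py
import torch
import torch.nn.functional as F

# Force scaled_dot_product_attention onto a single backend while benchmarking.
# enable_flash mirrors the --enable-flash CLI flag; the math and mem_efficient
# backends are disabled so the measurement isolates the chosen kernel.
enable_flash = True

q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)
with torch.backends.cuda.sdp_kernel(
    enable_flash=enable_flash, enable_math=False, enable_mem_efficient=False
):
    out = F.scaled_dot_product_attention(q, k, v)
```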
- res = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) + with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): + print(input_ids) + res = model.generate(input_ids, generation_config=gen_config) assert res.shape[1] == new_tokens + input_ids.shape[1] del res else: @@ -221,7 +229,8 @@ def benchmark_memory( ) if is_decoder: - _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) + with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): + _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) else: _ = model(input_ids, masks) @@ -324,7 +333,10 @@ def benchmark_memory( else: with device: model = autoclass.from_pretrained(args.model, torch_dtype=torch.float16) + +model.to_bettertransformer() torch.cuda.synchronize() + load_end = time.time_ns() act_order = None @@ -415,7 +427,8 @@ def benchmark_memory( torch.cuda.reset_peak_memory_stats() input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) - masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) + #masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) + masks=None with torch.no_grad(): max_mem = benchmark_memory( From f727313c2f19ae39a00b6ff6134f829c42b0b875 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 29 Sep 2023 08:55:35 +0000 Subject: [PATCH 11/19] Revert "Merge branch 'add_exllamav2' into update-benchmark-gptq" This reverts commit 97a7c62b0cf09ad4671a4198958977143a1191cf. --- .../usage_guides/quantization.mdx | 20 +------ optimum/bettertransformer/models/attention.py | 1 - optimum/gptq/quantizer.py | 47 ++++------------ tests/benchmark/benchmark_gptq.py | 21 ++----- tests/gptq/test_quantization.py | 56 +------------------ 5 files changed, 21 insertions(+), 124 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index ece7a69be92..2ec8d1f6683 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -86,26 +86,10 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False, disable_exllamav2=True) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllama=False) ``` -With the release of the exllamav2 kernel, you can get faster inference speed compared to the exllama kernels. You just need to -pass `disable_exllamav2` in [`~optimum.gptq.load_quantized_model`]: - -```py -from optimum.gptq import GPTQQuantizer, load_quantized_model -import torch - -from accelerate import init_empty_weights -with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) -empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=False) -``` - -Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. 
- -You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark) +Note that only 4-bit models are supported with exllama kernels for now. Furthermore, it is recommended to disable the exllama kernel when you are finetuning your model with peft. #### Fine-tune a quantized model diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 829609cdcbd..702aca3257b 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -583,7 +583,6 @@ def llama_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions is True: raise ValueError("output_attentions=True can not be supported with BetterTransformer.") diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 6ed928f10fd..70e8dfa954e 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -69,8 +69,7 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, *args, **kwargs, @@ -108,10 +107,8 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`bool`, defaults to `True`): + disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllamav2 (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -131,7 +128,6 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama - self.disable_exllamav2 = disable_exllamav2 self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ @@ -141,11 +137,6 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") - if not self.disable_exllamav2 and not self.disable_exllama: - logger.warning( - "You have activated exllama and exllamav2 backend. Setting `disable_exllama=True` and keeping `disable_exllamav2=False`" - ) - self.disable_exllama = True def to_dict(self): """ @@ -214,7 +205,6 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) if isinstance(module, QuantLinear): return @@ -450,21 +440,13 @@ def tmp(_, input, output): layer_inputs, layer_outputs = layer_outputs, [] torch.cuda.empty_cache() - if self.bits == 4: - # device not on gpu + if self.bits == 4 and not self.disable_exllama: if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): - if not self.disable_exllama: - logger.warning( - "Found modules on cpu/disk. 
Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" - ) - self.disable_exllama = True - if not self.disable_exllamav2: - logger.warning( - "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`" - ) - self.disable_exllamav2 = True - # act order and exllama - elif self.desc_act and not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True + elif self.desc_act: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " @@ -493,13 +475,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): + if self.bits == 4 and not self.disable_exllama: if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( - "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" + "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU." + "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" ) class StoreAttr(object): @@ -532,7 +514,6 @@ def pack_model( group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) logger.info("Packing model...") layers = get_layers(model) @@ -598,8 +579,7 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: bool = True, - disable_exllamav2: bool = False, + disable_exllama: bool = False, max_input_length: Optional[int] = None, ): """ @@ -635,8 +615,6 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllama (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. 
@@ -670,7 +648,6 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama - quantizer.disable_exllamav2 = disable_exllamav2 quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 57b74776ca0..45fdc262cee 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -103,11 +103,6 @@ def get_parser(): default=None, help="Revision of the model to benchmark", ) - parser.add_argument( - "--enable-flash", - action="store_true", - help="Use flash attention", - ) return parser @@ -129,8 +124,7 @@ def timing_cuda( start_event.record() if is_decoder: - with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): - _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) + _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config) else: _ = model(input_ids, masks) end_event.record() @@ -163,9 +157,7 @@ def warmup( eos_token_id=None, # This is required for min_new_tokens to actually have an effect. ) model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): - print(input_ids) - res = model.generate(input_ids, generation_config=gen_config) + res = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) assert res.shape[1] == new_tokens + input_ids.shape[1] del res else: @@ -229,8 +221,7 @@ def benchmark_memory( ) if is_decoder: - with torch.backends.cuda.sdp_kernel(enable_flash=args.enable_flash, enable_math=False, enable_mem_efficient=False): - _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) + _ = model.generate(input_ids, attention_mask=masks, generation_config=gen_config) else: _ = model(input_ids, masks) @@ -333,10 +324,7 @@ def benchmark_memory( else: with device: model = autoclass.from_pretrained(args.model, torch_dtype=torch.float16) - -model.to_bettertransformer() torch.cuda.synchronize() - load_end = time.time_ns() act_order = None @@ -427,8 +415,7 @@ def benchmark_memory( torch.cuda.reset_peak_memory_stats() input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device) - #masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) - masks=None + masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device) with torch.no_grad(): max_mem = benchmark_memory( diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 946b21e90bd..53ff1a722e5 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,7 +46,6 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True - disable_exllamav2 = True dataset = [ "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm." 
@@ -70,7 +69,6 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, - disable_exllamav2=cls.disable_exllamav2, ) cls.quantized_model = cls.quantizer.quantize_model(cls.model_fp16, cls.tokenizer) @@ -98,7 +96,6 @@ def test_quantized_layers_class(self): group_size=self.group_size, bits=self.bits, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -136,18 +133,13 @@ def test_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama ) self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False - disable_exllamav2 = True EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -161,7 +153,6 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True - disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -187,7 +178,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, disable_exllamav2=True + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False ) self.check_inference_correctness(quantized_model_from_saved) @@ -206,12 +197,7 @@ def test_exllama_max_input_length(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllama=False, - max_input_length=4028, - disable_exllamav2=True, + empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028 ) prompt = "I am in Paris and" * 1000 @@ -227,42 +213,6 @@ def test_exllama_max_input_length(self): quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) -class GPTQTestExllamav2(GPTQTest): - desc_act = False - disable_exllama = True - disable_exllamav2 = True - - def test_generate_quality(self): - # don't need to test - pass - - def test_serialization(self): - # don't need to test - pass - - def test_exllama_serialization(self): - """ - Test the serialization of the model and the loading of the quantized weights with exllamav2 kernel - """ - from accelerate import init_empty_weights - - with tempfile.TemporaryDirectory() as tmpdirname: - self.quantizer.save(self.quantized_model, tmpdirname) - self.quantized_model.config.save_pretrained(tmpdirname) - with init_empty_weights(): - empty_model = AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16 - ) - empty_model.tie_weights() - quantized_model_from_saved = load_quantized_model( - empty_model, - save_folder=tmpdirname, - device_map={"": 0}, - disable_exllamav2=False, - ) - self.check_inference_correctness(quantized_model_from_saved) - - class GPTQUtilsTest(unittest.TestCase): """ Test utilities From 1eaedeb7b6128f18951fd0f784c206de5fa24ea5 Mon Sep 17 
00:00:00 2001 From: Marc Sun Date: Fri, 29 Sep 2023 16:32:52 +0000 Subject: [PATCH 12/19] update benchmark prefill and generate --- tests/benchmark/README.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index ea6dedc1523..02a44b7ee64 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -40,6 +40,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |36.958 |27.058 |29152.98 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 32.25 | 31.01 | 11313.75 | |gptq |False |4 |128 |exllama|36.2 |33.711 |29.663 |10484.34 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |46.44 |21.53 |10344.62 | |bitsandbytes|None |None|None |None |37.64 |52.00 |19.23 |11018.36 | @@ -49,6 +50,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |37.35 |53.53 |30831.09 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 35.81 | 55.85 | 12112.42 | |gptq |False |4 |128 |exllama|36.2 |37.25 |53.68 |12162.43 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |47.41 |42.18 |12020.34 | |bitsandbytes|None |None|None |None |37.64 |74.62 |26.80 |12834.84 | @@ -58,6 +60,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel |Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |37.89 |105.55 |34187.22 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 36.04 | 110.98 | 16387.19 | |gptq |False |4 |128 |exllama |36.2 |54.14 |73.87 |15518.55 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |60.98 |65.59 |15374.67 | |bitsandbytes|None |None|None |None |37.64 |80.24 |49.85 |16187.69 | @@ -67,6 +70,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. |quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |47.37 |168.86 |40327.62 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 47.31 | 169.11 | 22463.02 | |gptq |False |4 |128 |exllama|36.2 |73.57 |108.73 |21864.56 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |104.44 |76.59 |20987.68 | |bitsandbytes|None |None|None |None |37.64 |91.29 |87.63 |22894.02 | @@ -76,6 +80,7 @@ Bitsandbytes uses the fp4 scheme, with the compute in fp16. 
|quantization |act_order|bits|group_size|kernel|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Peak memory (MB)| |-----|---------|----|----------|------|-------------|----------------------|------------------|----------------| |None|None |None|None |None |26.0 |69.94 |228.76 |53986.51 | +| gptq | False | 4 | 128 | exllamav2 | 36.07 | 83.09 | 192.55 | 35740.95 | |gptq |False |4 |128 |exllama|36.2 |95.41 |167.68 |34777.04 | |gptq |False |4 |128 |autogptq-cuda-old|36.2 |192.48 |83.12 |35497.62 | |bitsandbytes|None |None|None |None |37.64 |113.98 |140.38 |35532.37 | @@ -109,6 +114,7 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |96.38 |10.38 |27999.54 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 116.07 | 8.62 | 10260.35 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |112.54 |8.89 |9330.89 | |gptq |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |368.13 |2.72 |9474.19 | |bitsandbytes|None|None|None|None|512|1 |37.46|139.17 |7.19 |9952.65 | @@ -118,6 +124,7 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |169.95 |11.77 |28524.37 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 212.07 | 9.43 | 10783.60 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |190.44 |10.50 |9855.71 | |gptq |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |443.80 |4.51 |9928.23 | |bitsandbytes|None|None|None|None|512|1 |37.46|212.76 |9.40 |10421.89| @@ -127,6 +134,7 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |305.99 |13.07 |29574.01 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 385.58 | 10.37 | 11829.59 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |345.54 |11.58 |10905.35 | |gptq |False |4 |128 |autogptq-cuda-old|512 |1 |43.94 |597.24 |6.70 |10838.42 | |bitsandbytes|None|None|None|None|512|1 |37.46|349.18 |11.46|11440.08| @@ -136,17 +144,19 @@ The benchmark below is for a prompt length of 512, measuring only the prefill st |quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| |-----|---------|----|----------|-----------------|-------------|----------|-------------|----------------------|------------------|---------------| |None|None |None|None |None |512 |1 |27.22 |600.47 |13.32 |31673.30 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 753.06 | 10.62 | 13920.50 | |gptq |False |4 |128 |exllama |512 |1 |38.35 |659.61 |12.13 |13004.64 | |gptq |False |4 |128 
|autogptq-cuda-old|512 |1 |43.94 |909.09 |8.80 |12862.18 | |bitsandbytes|None|None|None|None|512|1 |37.46|643.42 |12.43|13539.37| ### Batch size = 16 -|quantization |act_order|bits|group_size|kernel |num_batches|batch_size|prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| -|-----|---------|----|----------|-----------------|-----------|----------|-------------|----------|-------------|----------------------|------------------|---------------| -|None|None |None|None |None |10 |16 |512 |1 |27.22 |1209.07 |13.23 |35871.88 | -|gptq |False |4 |128 |exllama |10 |16 |512 |1 |38.35 |1280.25 |12.50 |17203.22 | -|gptq |False |4 |128 |autogptq-cuda-old|10 |16 |512 |1 |43.94 |1533.54 |10.43 |17060.76 | +|quantization |act_order|bits|group_size|kernel |prompt_length|new_tokens|Load time (s)|Per-token latency (ms)|Throughput (tok/s)|Max memory (MB)| +|-----|---------|----|-----------|----------|-------------|----------|-------------|----------------------|------------------|---------------| +|None|None |None|None |None |512 |1 |27.22 |1209.07 |13.23 |35871.88 | +| gptq | False | 4 | 128 | exllamav2 | 512 | 1 | 6.63 | 1467.36 | 10.90 | 18104.44 | +|gptq |False |4 |128 |exllama |512 |1 |38.35 |1280.25 |12.50 |17203.22 | +|gptq |False |4 |128 |autogptq-cuda-old |512 |1 |43.94 |1533.54 |10.43 |17060.76 | |bitsandbytes|None|None|None|None|512|1 |37.46|1256.88|12.73|17737.95| ## Perplexity benchmark results From a6235567ebbc3083975bd377e35ec657f433604b Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Tue, 24 Oct 2023 20:08:59 +0200 Subject: [PATCH 13/19] replace by use_exllama_v2 --- tests/benchmark/README.md | 21 ++++++++++----------- tests/benchmark/benchmark_gptq.py | 8 ++++---- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index 02a44b7ee64..e6e778c3431 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -4,7 +4,7 @@ Please refer to https://medium.com/pytorch/bettertransformer-out-of-the-box-perf # GPTQ benchmark -The results below are for AutoGPTQ 0.4.2, PyTorch 2.0.1, bitsandbytes 0.41.1, transformers 4.32. +The results below are for AutoGPTQ 0.5.0, PyTorch 2.0.1, bitsandbytes 0.41.1, transformers 4.35. 
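A quick way to sanity-check the new exllamav2 rows against the rest of each table: the reported throughput appears to follow directly from the batch size and the per-token latency. A small illustrative check (the helper below is not part of the benchmark script):

```py
# Throughput (tok/s) ~= batch_size * 1000 / per-token latency (ms).
def throughput(batch_size: int, per_token_latency_ms: float) -> float:
    return batch_size * 1000 / per_token_latency_ms

# Values taken from the exllamav2 rows above.
print(round(throughput(1, 32.25), 2))     # ~31.01 tok/s, batch size 1 decode table
print(round(throughput(16, 1467.36), 2))  # ~10.90 tok/s, batch size 16 prefill table
```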
## Generation benchmark results @@ -15,13 +15,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama-v2 --generate # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --disable-exllamav2 --task text-generation --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --disable-exllamav2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes --generate @@ -94,14 +94,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use_exllama_v2 --generate # GPTQ with exllamav kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --disable-exllamav2 --sweep --num-batches 10 --gptq --task text-generation --prefill --generate - +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --disable-exllamav2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes --generate @@ -168,13 +167,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model 
TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use_exllama_v2 --ppl # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --disable-exllamav2 --task text-generation --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --disable-exllamav2 --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --ppl # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf ---task text-generation --bitsandbytes --ppl diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 45fdc262cee..fc26e54c9bb 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -84,9 +84,9 @@ def get_parser(): help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) parser.add_argument( - "--disable-exllamav2", + "--use-exllama-v2", action="store_true", - help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", + help="Use Exllamav2 kernel. It will disable exllama kernels by default", ) parser.add_argument( "--generate", @@ -305,7 +305,7 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: quantization_config = GPTQConfig( - bits=4, disable_exllama=args.disable_exllama, disable_exllamav2=args.disable_exllamav2 + bits=4, disable_exllama=args.disable_exllama, use_exllama_v2=args.use_exllama_v2 ) model = autoclass.from_pretrained( args.model, @@ -338,7 +338,7 @@ def benchmark_memory( bits = quantization_config_dict["bits"] group_size = quantization_config_dict["group_size"] - if not args.disable_exllamav2: + if args.use_exllama_v2: kernel = "exllamav2" elif not args.disable_exllama: # Exllama kernel can handle both the act-order / no act-order cases. 
From 26d87e421b115513795f00e45177b44cd6fba1a9 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 27 Oct 2023 23:10:05 +0200 Subject: [PATCH 14/19] update benchmark arg --- tests/benchmark/README.md | 16 ++++++++-------- tests/benchmark/benchmark_gptq.py | 12 +++++------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index e6e778c3431..0394b9cb3c3 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -18,10 +18,10 @@ CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-c CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama-v2 --generate # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --disable-exllama --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --bitsandbytes --generate @@ -94,13 +94,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use_exllama_v2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama-v2 --generate # GPTQ with exllamav kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama --generate # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --disable-exllama --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --generate # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --bitsandbytes --generate @@ -167,13 +167,13 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision 
gptq-4bit-128g-actorder_True --gptq --task text-generation --use_exllama_v2 --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama-v2 --ppl # GPTQ with exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama --ppl # GPTQ without exllama kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --disable-exllama --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --ppl # using bitsandbytes fp4/fp16 scheme CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf ---task text-generation --bitsandbytes --ppl diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index fc26e54c9bb..9f5f6d3dcee 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -79,9 +79,9 @@ def get_parser(): help="Use the parameter ranges for (batch_size, prompt_length, new_tokens) defined in the .py file instead of the CLI ones.", ) parser.add_argument( - "--disable-exllama", + "--use-exllama", action="store_true", - help="Disable Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", + help="Use Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) parser.add_argument( "--use-exllama-v2", @@ -236,7 +236,7 @@ def benchmark_memory( # I am not sure whether we should substract here `inactive_split_bytes.all.peak` (not sure what it corresponds to, though it can get quite large, in the several GB). peak_external_mb = peak_nvml_mb - peak_reserved_torch_mb - assert peak_external_mb > 0 + # assert peak_external_mb > 0 # This formula is to confirm. We measure the actual allocated PyTorch memory, plus the additional non-PyTorch memory (as the CUDA context, CUDA extension device memory). We need to substract the PyTorch peak reserved memory since this one appears in the peak nvidia-smi/nvmlDeviceGetMemoryInfo. @@ -304,9 +304,7 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: - quantization_config = GPTQConfig( - bits=4, disable_exllama=args.disable_exllama, use_exllama_v2=args.use_exllama_v2 - ) + quantization_config = GPTQConfig(bits=4, use_exllama=args.use_exllama, use_exllama_v2=args.use_exllama_v2) model = autoclass.from_pretrained( args.model, revision=args.revision, @@ -340,7 +338,7 @@ def benchmark_memory( if args.use_exllama_v2: kernel = "exllamav2" - elif not args.disable_exllama: + elif args.use_exllama: # Exllama kernel can handle both the act-order / no act-order cases. 
kernel = "exllama" elif act_order: From 4f797b114818fdf5f7c92a077100ba9733de1695 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 14:36:38 +0100 Subject: [PATCH 15/19] switch to a config_dict instead of disable_exllamav2 --- .../usage_guides/quantization.mdx | 4 +- optimum/gptq/quantizer.py | 105 +++++++++++------- optimum/utils/import_utils.py | 2 +- tests/gptq/test_quantization.py | 18 ++- 4 files changed, 76 insertions(+), 53 deletions(-) diff --git a/docs/source/llm_quantization/usage_guides/quantization.mdx b/docs/source/llm_quantization/usage_guides/quantization.mdx index 52e842e9b16..0757464f22b 100644 --- a/docs/source/llm_quantization/usage_guides/quantization.mdx +++ b/docs/source/llm_quantization/usage_guides/quantization.mdx @@ -89,7 +89,7 @@ empty_model.tie_weights() quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto") ``` -If you wish to use exllama kernels, you will have to disable exllamav2 kernels: +If you wish to use exllama kernels, you will have to change the version by setting `exllama_config`: ```py from optimum.gptq import GPTQQuantizer, load_quantized_model @@ -99,7 +99,7 @@ from accelerate import init_empty_weights with init_empty_weights(): empty_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) empty_model.tie_weights() -quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", disable_exllamav2=True) +quantized_model = load_quantized_model(empty_model, save_folder=save_folder, device_map="auto", exllama_config = {"version":1}) ``` Note that only 4-bit models are supported with exllama/exllamav2 kernels for now. Furthermore, it is recommended to disable exllama/exllamav2 kernels when you are finetuning your model with peft. diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 103015642c6..ea32035da32 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -15,6 +15,7 @@ import copy import json import os +from enum import Enum from logging import getLogger from typing import Any, Dict, List, Optional, Tuple, Union @@ -49,6 +50,11 @@ logger = getLogger(__name__) +class ExllamaVersion(int, Enum): + ONE = 1 + TWO = 2 + + class GPTQQuantizer(object): r""" A simple API for GPTQ Quantization @@ -69,8 +75,8 @@ def __init__( module_name_preceding_first_block: Optional[List[str]] = None, batch_size: int = 1, pad_token_id: Optional[int] = None, - disable_exllama: Optional[bool] = None, - disable_exllamav2: bool = False, + disable_exllama: bool = False, + exllama_config: Dict[str, Any] = None, max_input_length: Optional[int] = None, cache_block_outputs: Optional[bool] = True, *args, @@ -109,10 +115,10 @@ def __init__( The batch size of the dataset pad_token_id (`Optional[int]`, defaults to `None`): The pad token id. Needed to prepare the dataset when `batch_size` > 1. - disable_exllama (`Optional[bool]`, defaults to `None`): + disable_exllama (`bool`, defaults to `False`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllamav2 (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. + exllama_config (`Dict[str, Any]`, *optional*): + The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. 
It is specific to the exllama backend with act-order. @@ -135,7 +141,7 @@ def __init__( self.batch_size = batch_size self.pad_token_id = pad_token_id self.disable_exllama = disable_exllama - self.disable_exllamav2 = disable_exllamav2 + self.exllama_config = exllama_config self.max_input_length = max_input_length self.quant_method = QuantizationMethod.GPTQ self.cache_block_outputs = cache_block_outputs @@ -146,16 +152,18 @@ def __init__( raise ValueError("group_size must be greater than 0 or equal to -1") if not (0 < self.damp_percent < 1): raise ValueError("damp_percent must between 0 and 1.") - if not self.disable_exllamav2 and not self.disable_exllama: - raise ValueError( - "disable_exllamav2 and disable_exllama are both set to `False`. Please disable one of the kernels." - ) - # If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones. - if self.disable_exllama is None: - if self.disable_exllamav2: - self.disable_exllama = False - else: - self.disable_exllama = True + + if self.exllama_config is None: + self.exllama_config = {"version": ExllamaVersion.TWO} + else: + if "version" not in self.exllama_config: + raise ValueError("`exllama_config` needs to have a `version` key") + elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: + version = self.exllama_config["version"] + raise ValueError( + f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}" + ) + self.exllama_version = self.exllama_config["version"] def to_dict(self): """ @@ -223,8 +231,8 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st desc_act=self.desc_act, group_size=self.group_size, bits=self.bits, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, + disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, ) if isinstance(module, QuantLinear): return @@ -245,10 +253,18 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st out_features = layer.weight.shape[1] if not (self.desc_act) or self.group_size == -1: new_layer = QuantLinear( - self.bits, self.group_size, in_features, out_features, True, use_cuda_fp16=self.use_cuda_fp16, weight_dtype=layer.weight.dtype + self.bits, + self.group_size, + in_features, + out_features, + True, + use_cuda_fp16=self.use_cuda_fp16, + weight_dtype=layer.weight.dtype, ) else: - new_layer = QuantLinear(self.bits, self.group_size, in_features, out_features, True, weight_dtype=layer.weight.dtype) + new_layer = QuantLinear( + self.bits, self.group_size, in_features, out_features, True, weight_dtype=layer.weight.dtype + ) new_layer.device = device setattr(module, attr, new_layer.to(device)) for name1, child in module.named_children(): @@ -483,22 +499,22 @@ def tmp(_, input, output): if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): if not self.disable_exllama: logger.warning( - "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. 
Setting `disable_exllama=True`" ) self.disable_exllama = True # act order and exllama - elif self.desc_act and not self.disable_exllama: + elif self.desc_act and not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE: logger.warning( "Using Exllama backend with act_order will reorder the weights offline, thus you will not be able to save the model with the right weights." "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. " ) self.disable_exllama = True - elif not self.disable_exllamav2: + elif not self.disable_exllama and self.exllama_version == ExllamaVersion.TWO: logger.warning( "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights." - "Setting `disable_exllamav2=True`. You should only use Exllamav2 backend for inference. " + "Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. " ) - self.disable_exllamav2 = True + self.disable_exllama = True # Step 4: Pack the model at the end (Replacing the layers) self.pack_model(model=model, quantizers=quantizers) @@ -522,13 +538,13 @@ def post_init_model(self, model): model (`nn.Module`): The input model """ - if self.bits == 4 and (not self.disable_exllama or not self.disable_exllamav2): + if self.bits == 4 and not self.disable_exllama: if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` or `disable_exllamav2=True` in the quantization config object" + "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" ) class StoreAttr(object): @@ -537,7 +553,11 @@ class StoreAttr(object): model.quantize_config = StoreAttr() model.quantize_config.desc_act = self.desc_act model = autogptq_post_init(model, use_act_order=self.desc_act) - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + if ( + self.desc_act + and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE) + and self.max_input_length is not None + ): model = exllama_set_max_input_length(model, self.max_input_length) return model @@ -560,8 +580,8 @@ def pack_model( desc_act=self.desc_act, group_size=self.group_size, bits=self.bits, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, + disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, ) logger.info("Packing model...") layers = get_layers(model) @@ -627,8 +647,8 @@ def load_quantized_model( offload_folder: Optional[str] = None, offload_buffers: Optional[str] = None, offload_state_dict: bool = False, - disable_exllama: Optional[bool] = None, - disable_exllamav2: bool = False, + disable_exllama: bool = False, + exllama_config: Dict[str, Any] = None, max_input_length: Optional[int] = None, ): """ @@ -664,8 +684,8 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`Optional[bool]`, defaults to `None`): Whether to use exllama backend. Only works with `bits` = 4. - disable_exllama (`bool`, defaults to `False`): - Whether to use exllamav2 backend. Only works with `bits` = 4. 
+ exllama_config (`Dict[str, Any]`, *optional*): + The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. It is specific to the exllama backend with act-order. @@ -686,11 +706,16 @@ def load_quantized_model( device_map = {"": torch.cuda.current_device()} logger.info("The device_map was not initialized." "Setting device_map to `{'':torch.cuda.current_device()}`.") - if disable_exllama is None: - if disable_exllamav2: - disable_exllama = False - else: - disable_exllama = True + if exllama_config is None: + exllama_config = {"version": ExllamaVersion.TWO} + else: + if "version" not in exllama_config: + raise ValueError("`exllama_config` needs to have a `version` key") + elif exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]: + version = exllama_config["version"] + raise ValueError( + f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {version}" + ) # this branch will check if model is from huggingface try: @@ -705,7 +730,7 @@ def load_quantized_model( ) from err quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama - quantizer.disable_exllamav2 = disable_exllamav2 + quantizer.exllama_config = exllama_config quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index f262548975b..d1729d85991 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -35,7 +35,7 @@ TORCH_MINIMUM_VERSION = packaging.version.parse("1.11.0") TRANSFORMERS_MINIMUM_VERSION = packaging.version.parse("4.25.0") DIFFUSERS_MINIMUM_VERSION = packaging.version.parse("0.18.0") -AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0") +AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.4.2") # This is the minimal required version to support some ONNX Runtime features diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index e5811f91a87..2ebec3f4aad 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -46,7 +46,7 @@ class GPTQTest(unittest.TestCase): group_size = 128 desc_act = False disable_exllama = True - disable_exllamav2 = True + exllama_config = None cache_block_outputs = True dataset = [ @@ -71,7 +71,7 @@ def setUpClass(cls): group_size=cls.group_size, desc_act=cls.desc_act, disable_exllama=cls.disable_exllama, - disable_exllamav2=cls.disable_exllamav2, + exllama_config=cls.exllama_config, cache_block_outputs=cls.cache_block_outputs, ) @@ -99,8 +99,8 @@ def test_quantized_layers_class(self): desc_act=self.desc_act, group_size=self.group_size, bits=self.bits, - disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + disable_exllama=self.disable_exllama or self.exllama_config["version"] != 1, + disable_exllamav2=self.disable_exllama or self.exllama_config["version"] != 2, ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) @@ -142,14 +142,14 @@ def test_serialization(self): save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama, - disable_exllamav2=self.disable_exllamav2, + exllama_config=self.exllama_config, ) 
self.check_inference_correctness(quantized_model_from_saved) class GPTQTestExllama(GPTQTest): disable_exllama = False - disable_exllamav2 = True + exllama_config = {"version": 1} EXPECTED_OUTPUTS = set() EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I") EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.") @@ -163,7 +163,6 @@ class GPTQTestActOrder(GPTQTest): EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from") disable_exllama = True - disable_exllamav2 = True desc_act = True def test_generate_quality(self): @@ -189,7 +188,7 @@ def test_exllama_serialization(self): ) empty_model.tie_weights() quantized_model_from_saved = load_quantized_model( - empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllamav2=True + empty_model, save_folder=tmpdirname, device_map={"": 0}, exllama_config={"version": 1} ) self.check_inference_correctness(quantized_model_from_saved) @@ -212,7 +211,7 @@ def test_exllama_max_input_length(self): save_folder=tmpdirname, device_map={"": 0}, max_input_length=4028, - disable_exllamav2=True, + exllama_config={"version": 1}, ) prompt = "I am in Paris and" * 1000 @@ -231,7 +230,6 @@ def test_exllama_max_input_length(self): class GPTQTestExllamav2(GPTQTest): desc_act = False disable_exllama = True - disable_exllamav2 = True def test_generate_quality(self): # don't need to test From 1d845c79e73aeec71a6bbdbd43e43bad6ee4fc2b Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:23:53 -0400 Subject: [PATCH 16/19] Apply suggestions from code review Co-authored-by: fxmarty <9808326+fxmarty@users.noreply.github.com> --- optimum/gptq/quantizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index ea32035da32..56eca366051 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -648,7 +648,7 @@ def load_quantized_model( offload_buffers: Optional[str] = None, offload_state_dict: bool = False, disable_exllama: bool = False, - exllama_config: Dict[str, Any] = None, + exllama_config: Optional[Dict[str, Any]] = None, max_input_length: Optional[int] = None, ): """ @@ -684,7 +684,7 @@ def load_quantized_model( picked contains `"disk"` values. disable_exllama (`Optional[bool]`, defaults to `None`): Whether to use exllama backend. Only works with `bits` = 4. - exllama_config (`Dict[str, Any]`, *optional*): + exllama_config (`Optional[Dict[str, Any]]`, defaults to `None`): The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults to `{"version": 2}` if unset. max_input_length (`Optional[int]`, defaults to `None`): The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input length. 
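Between these docstring tweaks and the quantizer changes in the previous patch, the key behavior is how `exllama_config["version"]` is folded into the two boolean flags (`disable_exllama`, `disable_exllamav2`) handed to auto-gptq when picking the `QuantLinear` class. A small self-contained sketch of that mapping (the helper name is illustrative; the expressions mirror those used in `_replace_by_quant_layers` and `pack_model`):

```py
from enum import Enum


class ExllamaVersion(int, Enum):
    ONE = 1
    TWO = 2


def quantlinear_kernel_flags(disable_exllama: bool, exllama_version: int) -> dict:
    # Mirrors how the quantizer turns (disable_exllama, exllama_config["version"])
    # into the disable_exllama / disable_exllamav2 flags used for QuantLinear selection.
    return {
        "disable_exllama": disable_exllama or exllama_version != ExllamaVersion.ONE,
        "disable_exllamav2": disable_exllama or exllama_version != ExllamaVersion.TWO,
    }


print(quantlinear_kernel_flags(False, ExllamaVersion.TWO))  # exllamav2 kernel active (the default)
print(quantlinear_kernel_flags(False, ExllamaVersion.ONE))  # fall back to the exllama (v1) kernel
print(quantlinear_kernel_flags(True, ExllamaVersion.TWO))   # both disabled -> cuda / cuda-old kernels
```

On the user side this reduces to `load_quantized_model(..., exllama_config={"version": 1})` to pick the v1 kernel, with `{"version": 2}` assumed when the argument is left unset.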
From d5b298fdd04cde020203ee28faf6e7fa12a7fd01 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 16:24:43 +0100 Subject: [PATCH 17/19] better tests --- optimum/gptq/quantizer.py | 1 + tests/gptq/test_quantization.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index ea32035da32..e209c3d5bd8 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -731,6 +731,7 @@ def load_quantized_model( quantizer = GPTQQuantizer.from_dict(quantize_config_dict) quantizer.disable_exllama = disable_exllama quantizer.exllama_config = exllama_config + quantizer.exllama_version = quantizer.exllama_config["version"] quantizer.max_input_length = max_input_length model = quantizer.convert_model(model) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index 2ebec3f4aad..de2acddf6f2 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -104,6 +104,9 @@ def test_quantized_layers_class(self): ) self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear) + def check_quantized_layers_type(self, model, value): + self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value) + def check_inference_correctness(self, model): """ Test the generation quality of the quantized model and see that we are matching the expected output. @@ -118,6 +121,7 @@ def check_inference_correctness(self, model): # Check the exactness of the result self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + def test_generate_quality(self): self.check_inference_correctness(self.quantized_model) @@ -144,6 +148,7 @@ def test_serialization(self): disable_exllama=self.disable_exllama, exllama_config=self.exllama_config, ) + self.check_quantized_layers_type(quantized_model_from_saved,"cuda-old") self.check_inference_correctness(quantized_model_from_saved) @@ -190,6 +195,7 @@ def test_exllama_serialization(self): quantized_model_from_saved = load_quantized_model( empty_model, save_folder=tmpdirname, device_map={"": 0}, exllama_config={"version": 1} ) + self.check_quantized_layers_type(quantized_model_from_saved,"exllama") self.check_inference_correctness(quantized_model_from_saved) def test_exllama_max_input_length(self): @@ -213,6 +219,7 @@ def test_exllama_max_input_length(self): max_input_length=4028, exllama_config={"version": 1}, ) + self.check_quantized_layers_type(quantized_model_from_saved,"exllama") prompt = "I am in Paris and" * 1000 inp = self.tokenizer(prompt, return_tensors="pt").to(0) @@ -258,6 +265,7 @@ def test_exllama_serialization(self): save_folder=tmpdirname, device_map={"": 0}, ) + self.check_quantized_layers_type(quantized_model_from_saved,"exllamav2") self.check_inference_correctness(quantized_model_from_saved) From c21601d8ec1000968775acc7c50c130dce0c5636 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 16:26:30 +0100 Subject: [PATCH 18/19] style --- tests/gptq/test_quantization.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py index de2acddf6f2..7f50a57496a 100644 --- a/tests/gptq/test_quantization.py +++ b/tests/gptq/test_quantization.py @@ -121,7 +121,6 @@ def check_inference_correctness(self, model): # Check the exactness of the result self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) - def 
test_generate_quality(self): self.check_inference_correctness(self.quantized_model) @@ -148,7 +147,7 @@ def test_serialization(self): disable_exllama=self.disable_exllama, exllama_config=self.exllama_config, ) - self.check_quantized_layers_type(quantized_model_from_saved,"cuda-old") + self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old") self.check_inference_correctness(quantized_model_from_saved) @@ -195,7 +194,7 @@ def test_exllama_serialization(self): quantized_model_from_saved = load_quantized_model( empty_model, save_folder=tmpdirname, device_map={"": 0}, exllama_config={"version": 1} ) - self.check_quantized_layers_type(quantized_model_from_saved,"exllama") + self.check_quantized_layers_type(quantized_model_from_saved, "exllama") self.check_inference_correctness(quantized_model_from_saved) def test_exllama_max_input_length(self): @@ -219,7 +218,7 @@ def test_exllama_max_input_length(self): max_input_length=4028, exllama_config={"version": 1}, ) - self.check_quantized_layers_type(quantized_model_from_saved,"exllama") + self.check_quantized_layers_type(quantized_model_from_saved, "exllama") prompt = "I am in Paris and" * 1000 inp = self.tokenizer(prompt, return_tensors="pt").to(0) @@ -265,7 +264,7 @@ def test_exllama_serialization(self): save_folder=tmpdirname, device_map={"": 0}, ) - self.check_quantized_layers_type(quantized_model_from_saved,"exllamav2") + self.check_quantized_layers_type(quantized_model_from_saved, "exllamav2") self.check_inference_correctness(quantized_model_from_saved) From 11e71e260c7c8f7e6773da1f473d90b9836e9f89 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 1 Nov 2023 17:46:11 +0100 Subject: [PATCH 19/19] style --- tests/benchmark/README.md | 6 +++--- tests/benchmark/benchmark_gptq.py | 23 ++++++++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index 0394b9cb3c3..b6d6f9cddad 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -15,7 +15,7 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 4 --task text-generation --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama-v2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama --exllama-version 2 --generate # GPTQ with exllama kernel (int4/fp16) CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 4 --gptq --task text-generation --use-exllama --generate @@ -94,7 +94,7 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --sweep --num-batches 10 --task text-generation --prefill --generate # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama-v2 --generate +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task text-generation --prefill --use-exllama --exllama-version 2 --generate # GPTQ with exllamav kernel (int4/fp16) CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --sweep --num-batches 10 --gptq --task 
text-generation --prefill --use-exllama --generate @@ -167,7 +167,7 @@ Run CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model meta-llama/Llama-2-13b-chat-hf --task text-generation --ppl # GPTQ with exllamav2 kernel (int4/fp16) -CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama-v2 --ppl +CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama --exllama-version 2 --ppl # GPTQ with exllama kernel (int4/fp16) CUDA_VISIBLE_DEVICES=0 python benchmark_gptq.py --model TheBloke/Llama-2-13B-chat-GPTQ --revision gptq-4bit-128g-actorder_True --gptq --task text-generation --use-exllama --ppl diff --git a/tests/benchmark/benchmark_gptq.py b/tests/benchmark/benchmark_gptq.py index 9f5f6d3dcee..29f986015a4 100644 --- a/tests/benchmark/benchmark_gptq.py +++ b/tests/benchmark/benchmark_gptq.py @@ -84,9 +84,10 @@ def get_parser(): help="Use Exllama kernel, to rather use the AutoGPTQ CUDA (act-order case) or CUDA-old (no act-order case) kernels.", ) parser.add_argument( - "--use-exllama-v2", - action="store_true", - help="Use Exllamav2 kernel. It will disable exllama kernels by default", + "--exllama-version", + type=int, + default=2, + help="Use Exllamav2 kernel. Set 1 in order to use exllama kernel", ) parser.add_argument( "--generate", @@ -304,7 +305,9 @@ def benchmark_memory( load_start = time.time_ns() if args.gptq: - quantization_config = GPTQConfig(bits=4, use_exllama=args.use_exllama, use_exllama_v2=args.use_exllama_v2) + quantization_config = GPTQConfig( + bits=4, use_exllama=args.use_exllama, exllama_config={"version": args.exllama_version} + ) model = autoclass.from_pretrained( args.model, revision=args.revision, @@ -335,12 +338,14 @@ def benchmark_memory( act_order = quantization_config_dict["desc_act"] bits = quantization_config_dict["bits"] group_size = quantization_config_dict["group_size"] + use_exllama = quantization_config_dict["use_exllama"] + exllama_version = quantization_config_dict["exllama_config"]["version"] - if args.use_exllama_v2: - kernel = "exllamav2" - elif args.use_exllama: - # Exllama kernel can handle both the act-order / no act-order cases. - kernel = "exllama" + if use_exllama: + if exllama_version == 2: + kernel = "exllamav2" + else: + kernel = "exllama" elif act_order: kernel = "autotogptq-cuda" else:
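
To round off the benchmark changes in [PATCH 19/19]: the old `--use-exllama-v2` switch is replaced by `--use-exllama` combined with an integer `--exllama-version` (default 2), and the pair is forwarded to `GPTQConfig` as an `exllama_config` dict. The sketch below approximates the resulting loading call; the checkpoint id is taken from the README commands above, while the `torch_dtype` and `device_map` arguments are assumptions, since the `from_pretrained` call is only partially visible in the hunk.

```py
# Sketch only (not a verbatim copy of benchmark_gptq.py): how --use-exllama and
# --exllama-version map onto the GPTQConfig used to load a pre-quantized checkpoint.
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

use_exllama = True
exllama_version = 2  # 1 -> exllama kernel, 2 -> exllamav2 kernel (the script's default)

quantization_config = GPTQConfig(
    bits=4,
    use_exllama=use_exllama,
    exllama_config={"version": exllama_version},
)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-13B-chat-GPTQ",  # example checkpoint from the README commands
    quantization_config=quantization_config,
    torch_dtype=torch.float16,         # assumed, matching the fp16 setup elsewhere
    device_map="auto",                 # assumed
)
```

With this mapping, the kernel label reported by the script resolves to `"exllamav2"` when the version is 2 and `"exllama"` otherwise, as in the branch at the end of the hunk above.
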