diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py
new file mode 100644
index 000000000..348515d3a
--- /dev/null
+++ b/examples/inference/run_transformers.py
@@ -0,0 +1,6 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
+quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
+print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
+
diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py
new file mode 100755
index 000000000..565c074a2
--- /dev/null
+++ b/examples/quantization/transformers_usage.py
@@ -0,0 +1,13 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+model_id = "facebook/opt-125m"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
+quantized_model.save_pretrained("./opt-125m-gptq")
+tokenizer.save_pretrained("./opt-125m-gptq")
+
+model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")
+
+print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
\ No newline at end of file
diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py
index 0f8ea85af..5c136503f 100644
--- a/gptqmodel/utils/importer.py
+++ b/gptqmodel/utils/importer.py
@@ -34,38 +34,63 @@
 }
 
 
+def hf_select_quant_linear(
+    bits: int,
+    group_size: int,
+    desc_act: bool,
+    sym: bool,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
+    pack: bool = False,
+    dynamic=None,
+):
+    return select_quant_linear(
+        bits=bits,
+        group_size=group_size,
+        desc_act=desc_act,
+        sym=sym,
+        backend=backend,
+        format=format,
+        pack=pack,
+        dynamic=dynamic,
+    )
+
+
 # auto select the correct/optimal QuantLinear class
 def select_quant_linear(
     bits: int,
     group_size: int,
     desc_act: bool,
     sym: bool,
-    backend: BACKEND,
-    format: FORMAT,
+    backend: BACKEND = BACKEND.AUTO,
+    format: FORMAT = FORMAT.GPTQ,
     pack: bool = False,
     dynamic=None,
 ):
     # Handle the case where backend is AUTO.
     if backend == BACKEND.AUTO:
-        allow_backends = format_dict[format]
-        err = None
-        for k, values in backend_dict.items():
-            for v in values:
-                in_allow_backends = k in allow_backends
-                validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
-                if in_allow_backends and validate:
-                    if pack:
-                        check_pack_func = hasattr(v, "pack")
-                        if check_pack_func:
-                            logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
-                            return v
-                    else:
-                        logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
-                        return v
-        if err:
-            raise err
+        if not torch.cuda.is_available():
+            backend = BACKEND.IPEX
+        else:
+            allow_backends = format_dict[format]
+            err = None
+            for k, values in backend_dict.items():
+                for v in values:
+                    in_allow_backends = k in allow_backends
+                    validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
+                    if in_allow_backends and validate:
+                        if pack:
+                            check_pack_func = hasattr(v, "pack")
+                            if check_pack_func:
+                                logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                                return v
+                        else:
+                            logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                            return v
+            if err:
+                raise err
 
     # Handle the case where backend is not AUTO.
     if backend == BACKEND.TRITON:
diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py
index 3bea7eb13..c36a23135 100644
--- a/gptqmodel/utils/model.py
+++ b/gptqmodel/utils/model.py
@@ -21,6 +21,7 @@
 from transformers.utils.hub import cached_file
 
 from .backend import BACKEND
+from .exllama import exllama_set_max_input_length
 from .importer import select_quant_linear
 from .logger import setup_logger
 from .progress import ProgressBar
@@ -536,6 +537,9 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon
 
     torch.cuda.empty_cache()
 
+    # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear):
+    #     model = exllama_set_max_input_length(model, max_input_length)
+
     return model