transformers #713

Closed
wants to merge 4 commits into from
6 changes: 6 additions & 0 deletions examples/inference/run_transformers.py
@@ -0,0 +1,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
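
For reference, a variant of the same inference example that places the quantized model on the available GPU(s) and caps the generated length; device_map="auto" and max_new_tokens are standard transformers arguments, and the model id and prompt are unchanged. Treat it as a sketch rather than part of this PR.

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" places the quantized weights on the available GPU(s)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device)
# cap generation so the example finishes quickly
output_ids = quantized_model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))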

13 changes: 13 additions & 0 deletions examples/quantization/transformers_usage.py
@@ -0,0 +1,13 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")

model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")

print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
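
A hedged variant of the quantization config above: GPTQConfig also exposes group_size and desc_act, and its dataset argument accepts named calibration sets such as "c4" in addition to a list of strings. The values below are illustrative assumptions, not settings taken from this PR.

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# group_size/desc_act values are illustrative; "c4" is one of the named
# calibration datasets GPTQConfig understands
gptq_config = GPTQConfig(bits=4, group_size=128, desc_act=False, dataset="c4", tokenizer=tokenizer)

quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")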
59 changes: 42 additions & 17 deletions gptqmodel/utils/importer.py
@@ -34,38 +34,63 @@
 }
 
 
+def hf_select_quant_linear(
+        bits: int,
+        group_size: int,
+        desc_act: bool,
+        sym: bool,
+        backend: BACKEND = BACKEND.AUTO,
+        format: FORMAT = FORMAT.GPTQ,
+        pack: bool = False,
+        dynamic=None,
+):
+    return select_quant_linear(
+        bits=bits,
+        group_size=group_size,
+        desc_act=desc_act,
+        sym=sym,
+        backend=backend,
+        format=format,
+        pack=pack,
+        dynamic=dynamic,
+    )
+
+
 # auto select the correct/optimal QuantLinear class
 def select_quant_linear(
         bits: int,
         group_size: int,
         desc_act: bool,
         sym: bool,
-        backend: BACKEND,
-        format: FORMAT,
+        backend: BACKEND = BACKEND.AUTO,
+        format: FORMAT = FORMAT.GPTQ,
         pack: bool = False,
         dynamic=None,
 ):
     # Handle the case where backend is AUTO.
     if backend == BACKEND.AUTO:
-        allow_backends = format_dict[format]
-        err = None
-        for k, values in backend_dict.items():
+        if not torch.cuda.is_available():
+            backend = BACKEND.IPEX
+        else:
+            allow_backends = format_dict[format]
+            err = None
+            for k, values in backend_dict.items():
 
-            for v in values:
-                in_allow_backends = k in allow_backends
-                validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
-                if in_allow_backends and validate:
-                    if pack:
-                        check_pack_func = hasattr(v, "pack")
-                        if check_pack_func:
+                for v in values:
+                    in_allow_backends = k in allow_backends
+                    validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
+                    if in_allow_backends and validate:
+                        if pack:
+                            check_pack_func = hasattr(v, "pack")
+                            if check_pack_func:
+                                logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                                return v
+                        else:
                             logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
                             return v
-                    else:
-                        logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
-                        return v
 
-        if err:
-            raise err
+            if err:
+                raise err
 
     # Handle the case where backend is not AUTO.
     if backend == BACKEND.TRITON:
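
The new hf_select_quant_linear wrapper gives callers outside gptqmodel (for example the transformers/optimum integration) a stable entry point with AUTO/GPTQ defaults. A minimal sketch of a direct call, assuming the module path gptqmodel.utils.importer and using illustrative argument values:

from gptqmodel.utils.importer import hf_select_quant_linear

# Resolve a QuantLinear implementation for a typical 4-bit, symmetric,
# group_size=128 GPTQ checkpoint. With the change above, a machine without
# CUDA should fall back to the IPEX backend during AUTO selection.
quant_linear_cls = hf_select_quant_linear(
    bits=4,
    group_size=128,
    desc_act=False,
    sym=True,
)
print(f"Selected QuantLinear: {quant_linear_cls}")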
4 changes: 4 additions & 0 deletions gptqmodel/utils/model.py
@@ -21,6 +21,7 @@
 from transformers.utils.hub import cached_file
 
 from .backend import BACKEND
+from .exllama import exllama_set_max_input_length
 from .importer import select_quant_linear
 from .logger import setup_logger
 from .progress import ProgressBar
@@ -536,6 +537,9 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon
 
     torch.cuda.empty_cache()
 
+    # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear):
+    # model = exllama_set_max_input_length(model, max_input_length)
+
     return model


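
The post-init hook above is left commented out, but the newly imported helper can also be applied after loading. A hedged sketch, assuming GPTQModel.from_quantized is available and that exllama_set_max_input_length keeps the signature used in the commented line:

from gptqmodel import GPTQModel
from gptqmodel.utils.exllama import exllama_set_max_input_length

# Illustrative only: load a quantized model that uses exllama kernels, then
# resize its temporary buffers so longer prompts (here up to 4096 tokens) fit.
model = GPTQModel.from_quantized("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
model = exllama_set_max_input_length(model, 4096)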