transformers #713

Closed
wants to merge 4 commits into from
6 changes: 6 additions & 0 deletions examples/inference/run_transformers.py
@@ -0,0 +1,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
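
For reference, a variant of the same inference example that places the quantized model on the available GPU(s) and caps the generated length; device_map="auto" and max_new_tokens are standard transformers arguments, and the model id and prompt are unchanged. Treat it as a sketch rather than part of this PR.

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" places the quantized weights on the available GPU(s)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

inputs = tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device)
# cap generation so the example finishes quickly
output_ids = quantized_model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))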

13 changes: 13 additions & 0 deletions examples/quantization/transformers_usage.py
@@ -0,0 +1,13 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")

model = AutoModelForCausalLM.from_pretrained("./opt-125m-gptq", device_map="auto")

print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
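
A hedged variant of the quantization config above: GPTQConfig also exposes group_size and desc_act, and its dataset argument accepts named calibration sets such as "c4" in addition to a list of strings. The values below are illustrative assumptions, not settings taken from this PR.

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# group_size/desc_act values are illustrative; "c4" is one of the named
# calibration datasets GPTQConfig understands
gptq_config = GPTQConfig(bits=4, group_size=128, desc_act=False, dataset="c4", tokenizer=tokenizer)

quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", quantization_config=gptq_config)
quantized_model.save_pretrained("./opt-125m-gptq")
tokenizer.save_pretrained("./opt-125m-gptq")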
59 changes: 42 additions & 17 deletions gptqmodel/utils/importer.py
@@ -34,38 +34,63 @@
 }
 
 
+def hf_select_quant_linear(
+        bits: int,
+        group_size: int,
+        desc_act: bool,
+        sym: bool,
+        backend: BACKEND = BACKEND.AUTO,
+        format: FORMAT = FORMAT.GPTQ,
+        pack: bool = False,
+        dynamic=None,
+):
+    return select_quant_linear(
+        bits=bits,
+        group_size=group_size,
+        desc_act=desc_act,
+        sym=sym,
+        backend=backend,
+        format=format,
+        pack=pack,
+        dynamic=dynamic,
+    )
+
+
 # auto select the correct/optimal QuantLinear class
 def select_quant_linear(
         bits: int,
         group_size: int,
         desc_act: bool,
         sym: bool,
-        backend: BACKEND,
-        format: FORMAT,
+        backend: BACKEND = BACKEND.AUTO,
+        format: FORMAT = FORMAT.GPTQ,
         pack: bool = False,
         dynamic=None,
 ):
     # Handle the case where backend is AUTO.
     if backend == BACKEND.AUTO:
-        allow_backends = format_dict[format]
-        err = None
-        for k, values in backend_dict.items():
+        if not torch.cuda.is_available():
+            backend = BACKEND.IPEX
+        else:
+            allow_backends = format_dict[format]
+            err = None
+            for k, values in backend_dict.items():
 
-            for v in values:
-                in_allow_backends = k in allow_backends
-                validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
-                if in_allow_backends and validate:
-                    if pack:
-                        check_pack_func = hasattr(v, "pack")
-                        if check_pack_func:
+                for v in values:
+                    in_allow_backends = k in allow_backends
+                    validate, err = v.validate(bits, group_size, desc_act, sym, dynamic=dynamic)
+                    if in_allow_backends and validate:
+                        if pack:
+                            check_pack_func = hasattr(v, "pack")
+                            if check_pack_func:
+                                logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
+                                return v
+                        else:
                             logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
                             return v
-                    else:
-                        logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}")
-                        return v
 
-        if err:
-            raise err
+            if err:
+                raise err
 
     # Handle the case where backend is not AUTO.
     if backend == BACKEND.TRITON:
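
The new hf_select_quant_linear wrapper gives callers outside gptqmodel (for example the transformers/optimum integration) a stable entry point with AUTO/GPTQ defaults. A minimal sketch of a direct call, assuming the module path gptqmodel.utils.importer and using illustrative argument values:

from gptqmodel.utils.importer import hf_select_quant_linear

# Resolve a QuantLinear implementation for a typical 4-bit, symmetric,
# group_size=128 GPTQ checkpoint. With the change above, a machine without
# CUDA should fall back to the IPEX backend during AUTO selection.
quant_linear_cls = hf_select_quant_linear(
    bits=4,
    group_size=128,
    desc_act=False,
    sym=True,
)
print(f"Selected QuantLinear: {quant_linear_cls}")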
4 changes: 4 additions & 0 deletions gptqmodel/utils/model.py
@@ -21,6 +21,7 @@
 from transformers.utils.hub import cached_file
 
 from .backend import BACKEND
+from .exllama import exllama_set_max_input_length
 from .importer import select_quant_linear
 from .logger import setup_logger
 from .progress import ProgressBar
@@ -536,6 +537,9 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon
 
     torch.cuda.empty_cache()
 
+    # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear):
+    # model = exllama_set_max_input_length(model, max_input_length)
+
     return model


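
The post-init hook above is left commented out, but the newly imported helper can also be applied after loading. A hedged sketch, assuming GPTQModel.from_quantized is available and that exllama_set_max_input_length keeps the signature used in the commented line:

from gptqmodel import GPTQModel
from gptqmodel.utils.exllama import exllama_set_max_input_length

# Illustrative only: load a quantized model that uses exllama kernels, then
# resize its temporary buffers so longer prompts (here up to 4096 tokens) fit.
model = GPTQModel.from_quantized("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
model = exllama_set_max_input_length(model, 4096)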