From e604af32fcd054cdeafcfb5553d02e92e0787fd3 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Mon, 9 Sep 2024 17:36:05 +0200
Subject: [PATCH] Add quanto install and instructions (#1976)

* chore: add quanto install option

* docs: add quanto to README

* Apply suggestions from code review

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>

---------

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 README.md | 31 +++++++++++++++++++++++++++++++
 setup.py  |  1 +
 2 files changed, 32 insertions(+)

diff --git a/README.md b/README.md
index 9a6403cdacb..9a81e69e126 100644
--- a/README.md
+++ b/README.md
@@ -268,3 +268,34 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op
 ```
 
 You can find more examples in the [documentation](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer) and in the [examples](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/training).
+
+
+### Quanto
+
+[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend.
+
+You can quantize a model either using the Python API or the `optimum-cli`.
+
+```python
+from transformers import AutoModelForCausalLM
+from optimum.quanto import QuantizedModelForCausalLM, qint4
+
+model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B')
+qmodel = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head')
+```
+
+The quantized model can be saved using `save_pretrained`:
+
+```python
+qmodel.save_pretrained('./Llama-3.1-8B-quantized')
+```
+
+It can later be reloaded using `from_pretrained`:
+
+```python
+from optimum.quanto import QuantizedModelForCausalLM
+
+qmodel = QuantizedModelForCausalLM.from_pretrained('./Llama-3.1-8B-quantized')
+```
+
+You can see more details and [examples](https://github.com/huggingface/optimum-quanto/tree/main/examples) in the [Quanto](https://github.com/huggingface/optimum-quanto) repository.
diff --git a/setup.py b/setup.py
index 98ee4f36a3f..ac5db71a74b 100644
--- a/setup.py
+++ b/setup.py
@@ -88,6 +88,7 @@
     "graphcore": "optimum-graphcore",
     "furiosa": "optimum-furiosa",
     "amd": "optimum-amd",
+    "quanto": ["optimum-quanto>=0.2.4"],
     "dev": TESTS_REQUIRE + QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
     "quality": QUALITY_REQUIRE,
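
The `setup.py` hunk above is what makes the new README snippets installable as an extra. Below is a minimal sketch, not part of the patch, assuming the modified dict is the package's `extras_require` so that the extra is pulled in with `pip install "optimum[quanto]"`; it only checks that the resulting environment satisfies the `optimum-quanto>=0.2.4` pin and can import the classes used in the README examples.

```python
# Hedged sketch, not part of the patch: after installing the new extra
# (e.g. `pip install "optimum[quanto]"`, assuming the modified dict is
# the package's extras_require), verify the ">=0.2.4" pin and the imports
# used in the README examples.
from importlib.metadata import version

from packaging.version import Version

from optimum.quanto import QuantizedModelForCausalLM, qint4  # noqa: F401

installed = version("optimum-quanto")
assert Version(installed) >= Version("0.2.4"), installed
print(f"optimum-quanto {installed} satisfies the new 'quanto' extra pin")
```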