From e604af32fcd054cdeafcfb5553d02e92e0787fd3 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Mon, 9 Sep 2024 17:36:05 +0200
Subject: [PATCH] Add quanto install and instructions (#1976)

* chore: add quanto install option

* docs: add quanto to README

* Apply suggestions from code review

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>

---------

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 README.md | 31 +++++++++++++++++++++++++++++++
 setup.py  |  1 +
 2 files changed, 32 insertions(+)

diff --git a/README.md b/README.md
index 9a6403cdacb..9a81e69e126 100644
--- a/README.md
+++ b/README.md
@@ -268,3 +268,34 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op
 ```
 
 You can find more examples in the [documentation](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer) and in the [examples](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/training).
+
+
+### Quanto
+
+[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend.
+
+You can quantize a model either using the Python API or the `optimum-cli`.
+
+```python
+from transformers import AutoModelForCausalLM
+from optimum.quanto import QuantizedModelForCausalLM, qint4
+
+model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B')
+qmodel = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head')
+```
+
+The quantized model can be saved using `save_pretrained`:
+
+```python
+qmodel.save_pretrained('./Llama-3.1-8B-quantized')
+```
+
+It can later be reloaded using `from_pretrained`:
+
+```python
+from optimum.quanto import QuantizedModelForCausalLM
+
+qmodel = QuantizedModelForCausalLM.from_pretrained('./Llama-3.1-8B-quantized')
+```
+
+You can see more details and [examples](https://github.com/huggingface/optimum-quanto/tree/main/examples) in the [Quanto](https://github.com/huggingface/optimum-quanto) repository.
diff --git a/setup.py b/setup.py
index 98ee4f36a3f..ac5db71a74b 100644
--- a/setup.py
+++ b/setup.py
@@ -88,6 +88,7 @@
     "graphcore": "optimum-graphcore",
     "furiosa": "optimum-furiosa",
     "amd": "optimum-amd",
+    "quanto": ["optimum-quanto>=0.2.4"],
     "dev": TESTS_REQUIRE + QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
     "quality": QUALITY_REQUIRE,
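
The `setup.py` hunk above is what makes the new README snippets installable as an extra. Below is a minimal sketch, not part of the patch, assuming the modified dict is the package's `extras_require` so that the extra is pulled in with `pip install "optimum[quanto]"`; it only checks that the resulting environment satisfies the `optimum-quanto>=0.2.4` pin and can import the classes used in the README examples.

```python
# Hedged sketch, not part of the patch: after installing the new extra
# (e.g. `pip install "optimum[quanto]"`, assuming the modified dict is
# the package's extras_require), verify the ">=0.2.4" pin and the imports
# used in the README examples.
from importlib.metadata import version

from packaging.version import Version

from optimum.quanto import QuantizedModelForCausalLM, qint4  # noqa: F401

installed = version("optimum-quanto")
assert Version(installed) >= Version("0.2.4"), installed
print(f"optimum-quanto {installed} satisfies the new 'quanto' extra pin")
```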