.. _bits_and_bytes:

BitsAndBytes
==================

vLLM now supports `BitsAndBytes <https://github.com/TimDettmers/bitsandbytes>`_ for more efficient model inference.
BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
This is particularly useful for deploying large language models in resource-constrained environments.
Below are the steps to utilize BitsAndBytes with vLLM.

.. code-block:: console

   $ pip install bitsandbytes>=0.42.0

vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoints.

Read quantized checkpoint
--------------------------

.. code-block:: python

   from vllm import LLM
   import torch

   # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
   model_id = "unsloth/tinyllama-bnb-4bit"
   llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True,
             quantization="bitsandbytes", load_format="bitsandbytes")

In-flight quantization: load as 4-bit quantization
---------------------------------------------------

.. code-block:: python

   from vllm import LLM
   import torch

   model_id = "huggyllama/llama-7b"
   llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True,
             quantization="bitsandbytes", load_format="bitsandbytes")
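
In either case, the returned ``LLM`` object is used the same way as an unquantized one.
Below is a minimal generation sketch; the prompt text and sampling values are illustrative, not part of the original example.

.. code-block:: python

   from vllm import LLM, SamplingParams
   import torch

   # Illustrative prompt and sampling settings; adjust as needed.
   prompts = ["The capital of France is"]
   sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

   # Load a pre-quantized checkpoint (the in-flight path works identically).
   llm = LLM(model="unsloth/tinyllama-bnb-4bit", dtype=torch.bfloat16,
             trust_remote_code=True, quantization="bitsandbytes",
             load_format="bitsandbytes")

   # Generate completions with the quantized model.
   outputs = llm.generate(prompts, sampling_params)
   for output in outputs:
       print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")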