predibase · tgaddair · Jan 22, 2024 · Dec 22, 2023 · Dec 22, 2023 · Dec 22, 2023
diff --git a/Dockerfile b/Dockerfile
@@ -37,7 +37,7 @@ RUN cargo build --release
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
 FROM debian:bullseye-slim as pytorch-install
 
-ARG PYTORCH_VERSION=2.1.1
+ARG PYTORCH_VERSION=2.1.2
 ARG PYTHON_VERSION=3.10
 ARG CUDA_VERSION=11.8
 ARG MAMBA_VERSION=23.1.0-1

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
@@ -26,6 +26,9 @@ enum Quantization {
     BitsandbytesFP4,
     Gptq,
     Awq,
+    Hqq_4bit,
+    Hqq_3bit,
+    Hqq_2bit,
 }
 
 impl std::fmt::Display for Quantization {
@@ -47,6 +50,15 @@ impl std::fmt::Display for Quantization {
             Quantization::Awq => {
                 write!(f, "awq")
             }
+            Quantization::Hqq_4bit => {
+                write!(f, "hqq-4bit")
+            }
+            Quantization::Hqq_3bit => {
+                write!(f, "hqq-3bit")
+            }
+            Quantization::Hqq_2bit => {
+                write!(f, "hqq-2bit")
+            }
         }
     }
 }

diff --git a/server/lorax_server/cli.py b/server/lorax_server/cli.py
@@ -17,6 +17,9 @@ class Quantization(str, Enum):
     bitsandbytes_fp4 = "bitsandbytes-fp4"
     gptq = "gptq"
     awq = "awq"
+    hqq_4bit = "hqq-4bit"
+    hqq_3bit = "hqq-3bit"
+    hqq_2bit = "hqq-2bit"
 
 
 class Dtype(str, Enum):

diff --git a/server/lorax_server/utils/layers.py b/server/lorax_server/utils/layers.py
@@ -21,6 +21,17 @@
 except ImportError:
     HAS_AWQ = False
 
+HAS_HQQ = True  # optional-dependency flag; reset to False below if `hqq` cannot be imported
+try:
+    from hqq.core.quantize import BaseQuantizeConfig, HQQLinear
+
+    class HQQLinearLayer(HQQLinear):  # HQQLinear subclass that only adds a read-only `weight` property
+        @property
+        def weight(self) -> torch.Tensor:  # presumably for callers that read `.weight` off a linear layer — confirm
+            return self.W_q  # attribute provided by HQQLinear; NOTE(review): looks like the quantized weights — verify against hqq
+except ImportError:
+    HAS_HQQ = False  # BaseQuantizeConfig / HQQLinearLayer stay undefined in this case
+
 from accelerate import init_empty_weights
 
 from lorax_server.utils.gptq.quant_linear import QuantLinear
@@ -271,6 +282,35 @@ def get_linear(weight, bias, quantize, fan_in_fan_out=False):
                 f"The passed weight is not compatible with `awq`"
             )
         linear = AWQLinear(w_bit=bits, group_size=groupsize, qweight=qweight, qzeros=qzeros, scales=scales, bias=bias is not None)
+    elif "hqq-" in quantize:
+        # `hqq` is an optional dependency: without this guard a missing package
+        # would only surface below as a NameError on BaseQuantizeConfig.
+        if not HAS_HQQ:
+            raise ImportError(
+                f"Quantization `{quantize}` requires the `hqq` package, which is not installed."
+            )
+
+        # Supported variants -> (nbits, group_size) handed to BaseQuantizeConfig.
+        hqq_settings = {
+            "hqq-4bit": (4, 64),
+            "hqq-3bit": (3, 64),
+            "hqq-2bit": (2, 16),
+        }
+        if quantize not in hqq_settings:
+            # Reject unknown variants (e.g. "hqq-8bit") explicitly instead of
+            # reaching HQQLinearLayer with `quant_config` unbound.
+            raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
+        nbits, group_size = hqq_settings[quantize]
+        quant_config = BaseQuantizeConfig(nbits=nbits, group_size=group_size, quant_zero=True, quant_scale=False)
+
+        # init nn.linear from weight and bias
+        layer = nn.Linear(weight.shape[1], weight.shape[0], bias=bias is not None)
+        with torch.no_grad():
+            layer.weight.data = weight
+            if bias is not None:
+                layer.bias.data = bias
+
+        linear = HQQLinearLayer(layer, quant_config, del_orig=True)
     else:
         raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
     return linear

diff --git a/server/poetry.lock b/server/poetry.lock
diff --git a/server/pyproject.toml b/server/pyproject.toml
@@ -27,23 +27,24 @@ hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
 tokenizers = "0.15.0"
 huggingface-hub = "^0.19.4"
-transformers = "4.36.0"
+transformers = "^4.36.1"
 einops = "^0.6.1"
 tiktoken = "^0.5.2"
 texttable = { version = "^1.6.7", optional = true }
 datasets = { version = "^2.14.0", optional = true }
-torch = {version = "2.1.1", optional = true }
-peft = {version = "0.4.0", optional = true }
+torch = { version = "2.1.2", optional = true }
+peft = { version = "0.4.0", optional = true }
 boto3 = "^1.28.34"
 urllib3 = "<=1.26.18"
+hqq = { version = "^0.1.2", optional = true }
 stanford-stk = { version = "^0.0.6", markers = "sys_platform == 'linux'" }
 
 [tool.poetry.extras]
 torch = ["torch"]
 accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
 peft = ["peft"]
-quantize = ["texttable", "datasets", "accelerate"]
+quantize = ["texttable", "datasets", "accelerate", "hqq"]
 
 [tool.poetry.group.dev.dependencies]
 grpcio-tools = "^1.51.1"

diff --git a/server/requirements.txt b/server/requirements.txt
@@ -47,7 +47,7 @@ stanford-stk==0.0.6 ; python_version >= "3.9" and python_version < "4.0" and sys
 tiktoken==0.5.2 ; python_version >= "3.9" and python_version < "4.0"
 tokenizers==0.15.0 ; python_version >= "3.9" and python_version < "4.0"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "4.0"
-transformers==4.36.0 ; python_version >= "3.9" and python_version < "4.0"
+transformers==4.36.2 ; python_version >= "3.9" and python_version < "4.0"
 triton==2.1.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 typing-extensions==4.9.0 ; python_version >= "3.9" and python_version < "4.0"