4-bit cache support is now active
psych0v0yager committed Mar 8, 2024
1 parent e33d344 · commit be528af
Showing 1 changed file with 4 additions and 3 deletions.
outlines/models/exllamav2.py (4 additions & 3 deletions)
@@ -143,10 +143,11 @@ def exl2(
"""

try:
from exllamav2 import ( # , ExLlamaV2Cache_Q4
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Cache_Q4,
ExLlamaV2Config,
)
from transformers import AutoTokenizer
@@ -209,8 +210,8 @@ def exl2(

     if cache_8bit:
         cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded)
-    # elif cache_q4:
-    #     cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
+    elif cache_q4:
+        cache = ExLlamaV2Cache_Q4(model, lazy=not model.loaded)
     else:
         cache = ExLlamaV2Cache(model, lazy=not model.loaded)

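The activated branch selects ExLlamaV2Cache_Q4 whenever a cache_q4 flag is set, alongside the existing 8-bit and FP16 options. A minimal usage sketch follows; it assumes the loader is exposed as outlines.models.exl2 and that cache_q4 is a keyword argument of that function (this diff only shows the branch consuming the flag, so the exact signature is an assumption):

import outlines.models as models

# Load an EXL2-quantized model with the newly activated 4-bit KV cache.
# `model_path` is a placeholder; `cache_q4=True` routes cache construction
# through the ExLlamaV2Cache_Q4 branch added by this commit.
model = models.exl2(
    model_path="path/to/exl2-quantized-model",
    cache_q4=True,
)

Note the precedence in the branch: cache_8bit wins if both flags are set, the FP16 ExLlamaV2Cache remains the default, and every variant passes lazy=not model.loaded so cache allocation is deferred until the model weights are loaded.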
