Refactored exl2 method to add LoRA, 8bit cache, and other features su…

…pported by exllama (#729) Refactored the exl2 function in exllamav2.py. The new version offers the following benefits: 1. auto split support. You no longer need to split a large model over 2 GPUs manually, exllama will do it for you 2. 8 bit cache support. Supports the 8 bit cache, can squeeze more context into the same GPU 3. Additional exllamav2 improvements. Supports low_mem, fasttensors. 4. No longer need to pass in num_experts, it is optional. 5. Future support for 4 bit cache. Whenever turbo updates the pip package, uncomment the 4 bit lines for 4 bit support. 6. Refactored the function parameters. Changed the model_kwargs dictionary to individual parameters. Combined with documentation this makes it easier for new users to understand what options they can select.
dottxt-ai · Mar 13, 2024 · 03c71f7 · 03c71f7
1 parent d47bd6b
commit 03c71f7
Showing 1 changed file with 125 additions and 27 deletions.
diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py
@@ -1,13 +1,14 @@
+import os
 from typing import TYPE_CHECKING, Optional
 
 import torch
 
-from .transformers import TransformerTokenizer
-
 if TYPE_CHECKING:
-    from exllamav2 import ExLlamaV2, ExLlamaV2Cache
+    from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora
     from transformers import PreTrainedTokenizer
 
+from .transformers import TransformerTokenizer
+
 
 class ExLlamaV2Model:
     """Represents a `exl2` model."""
@@ -18,12 +19,14 @@ def __init__(
         tokenizer: "PreTrainedTokenizer",
         device,
         cache: "ExLlamaV2Cache",
+        lora: Optional["ExLlamaV2Lora"] = None,
     ):
         self.device = device
         self.model = model
         self.tokenizer = TransformerTokenizer(tokenizer)
         self.cache = cache
         self.past_seq = None
+        self.lora = lora
 
     def forward(self, input_ids: torch.LongTensor, *_):
         """Compute a forward pass through the exl2 model."""
@@ -50,6 +53,7 @@ def forward(self, input_ids: torch.LongTensor, *_):
                         seq_tensor[longest_prefix:-1].view(1, -1),
                         self.cache,
                         preprocess_only=True,
+                        loras=[self.lora],
                     )
                 elif seq_tensor.shape[0] == longest_prefix:
                     self.cache.current_seq_len -= 1
@@ -61,58 +65,152 @@ def forward(self, input_ids: torch.LongTensor, *_):
                     seq_tensor[:-1].view(1, -1),
                     self.cache,
                     preprocess_only=True,
+                    loras=[self.lora],
                 )
 
         self.past_seq = seq_tensor
 
-        return self.model.forward(seq_tensor[-1:].view(1, -1), self.cache)
+        return self.model.forward(
+            seq_tensor[-1:].view(1, -1), self.cache, loras=[self.lora]
+        )
 
     def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor:
         logits = self.forward(input_ids)
         next_token_logits = logits[..., -1, :]
 
         return next_token_logits, None
 
+    def update_lora(self, lora_path: Optional[str] = None):
+        """
+        Update and apply the LoRA to the model.
+
+        Args:
+            lora_path (Optional[str]): The path to the LoRA directory. If None, the LoRA will be unloaded.
+        """
+        try:
+            from exllamav2 import ExLlamaV2Lora
+        except ImportError:
+            raise ImportError(
+                "The `exllamav2` library needs to be installed in order to use `exllamav2` models."
+            )
+        if lora_path is None:
+            if self.lora is not None:
+                print(" -- Unloading LoRA...")
+            self.lora = None
+        else:
+            self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path)
+            print(" -- Loading LoRA...")
+
 
 def exl2(
     model_path: str,
-    device: Optional[str] = None,
-    model_kwargs: dict = {},
+    device: str,
+    max_seq_len: Optional[int] = None,
+    scale_pos_emb: Optional[float] = None,
+    scale_alpha_value: Optional[float] = None,
+    no_flash_attn: Optional[bool] = None,
+    num_experts_per_token: Optional[int] = None,
+    cache_8bit: bool = False,
+    cache_q4: bool = False,
     tokenizer_kwargs: dict = {},
-):
+    gpu_split: Optional[str] = None,
+    low_mem: Optional[bool] = None,
+    verbose: Optional[bool] = None,
+) -> ExLlamaV2Model:
+    """
+    Load an ExLlamaV2 model.
+
+    Args:
+        model_path (str): Path to the model directory.
+        device (str): Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU
+        max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to None.
+        scale_pos_emb (Optional[float], optional): Scale factor for positional embeddings. Defaults to None.
+        scale_alpha_value (Optional[float], optional): Scale alpha value. Defaults to None.
+        no_flash_attn (Optional[bool], optional): Disable flash attention. Defaults to None.
+        num_experts_per_token (Optional[int], optional): Number of experts per token. Defaults to None.
+        cache_8bit (bool, optional): Use 8-bit cache. Defaults to False.
+        cache_q4 (bool, optional): Use Q4 cache. Defaults to False.
+        tokenizer_kwargs (dict, optional): Additional keyword arguments for the tokenizer. Defaults to {}.
+        gpu_split (str): \"auto\", or VRAM allocation per GPU in GB. Auto will use exllama's autosplit feature
+        low_mem (bool, optional): Enable VRAM optimizations, potentially trading off speed
+        verbose (bool, optional): Enable if you want debugging statements
+
+    Returns:
+        ExLlamaV2Model: Loaded ExLlamaV2 model.
+
+    Raises:
+        ImportError: If the `exllamav2` library is not installed.
+    """
+
     try:
-        from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config
+        from exllamav2 import (
+            ExLlamaV2,
+            ExLlamaV2Cache,
+            ExLlamaV2Cache_8bit,
+            ExLlamaV2Cache_Q4,
+            ExLlamaV2Config,
+        )
         from transformers import AutoTokenizer
     except ImportError:
         raise ImportError(
             "The `exllamav2` library needs to be installed in order to use `exllamav2` models."
         )
 
+    # Load tokenizer
+    if not verbose:
+        print(" -- Loading tokenizer...")
+    tokenizer_kwargs.setdefault("padding_side", "left")
+    tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs)
+    # tokenizer = TransformerTokenizer(model_path, **tokenizer_kwargs)
+
+    # Check fasttensors for config
+    if os.name != "nt":
+        use_fasttensors = True
+    else:
+        use_fasttensors = False
+
+    # Create config
     config = ExLlamaV2Config()
     config.model_dir = model_path
+    config.fasttensors = use_fasttensors
     config.prepare()
 
-    config.max_seq_len = model_kwargs.pop("max_seq_len", config.max_seq_len)
-    config.scale_pos_emb = model_kwargs.pop("scale_pos_emb", config.scale_pos_emb)
-    config.scale_alpha_value = model_kwargs.pop(
-        "scale_alpha_value", config.scale_alpha_value
-    )
-    config.no_flash_attn = model_kwargs.pop("no_flash_attn", config.no_flash_attn)
-    config.num_experts_per_token = int(
-        model_kwargs.pop("num_experts_per_token", config.num_experts_per_token)
-    )
-
+    # Set config options
+    if max_seq_len is not None:
+        config.max_seq_len = max_seq_len
+    if scale_pos_emb is not None:
+        config.scale_pos_emb = scale_pos_emb
+    if scale_alpha_value is not None:
+        config.scale_alpha_value = scale_alpha_value
+    if no_flash_attn is not None:
+        config.no_flash_attn = no_flash_attn
+    if num_experts_per_token is not None:
+        config.num_experts_per_token = num_experts_per_token
+    if low_mem:
+        config.set_low_mem()
+
+    # Prepare the model from the config
     model = ExLlamaV2(config)
 
-    split = None
-    if "gpu_split" in model_kwargs.keys():
-        split = [float(alloc) for alloc in model_kwargs["gpu_split"].split(",")]
-
-    model.load(split)
+    # Create cache
+    if cache_8bit:
+        cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded)
+    elif cache_q4:
+        cache = ExLlamaV2Cache_Q4(model, lazy=not model.loaded)
+    else:
+        cache = ExLlamaV2Cache(model, lazy=not model.loaded)
 
-    tokenizer_kwargs.setdefault("padding_side", "left")
-    tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs)
-
-    cache = ExLlamaV2Cache(model)
+    # Load the model
+    split = None
+    if gpu_split and gpu_split != "auto":
+        split = [float(alloc) for alloc in gpu_split.split(",")]
+        if not verbose:
+            print(" -- Loading model...")
+        model.load(split)
+
+    # Autoload if no GPU split was provided
+    if not model.loaded:
+        print(" -- Loading model...")
+        model.load_autosplit(cache)
 
     return ExLlamaV2Model(model, tokenizer, device, cache)