From 14d01609bddd80082e56168fc90a32e098c5fc2b Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Tue, 5 Mar 2024 18:48:35 -0600 Subject: [PATCH 01/25] Refactored exl2 method to add in more features supported by the exllamav2 library --- outlines/models/exllamav2.py | 109 +++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 17 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index b06e5e60a..22a52f8cb 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,3 +1,4 @@ +import os from typing import TYPE_CHECKING, Optional import torch @@ -76,43 +77,117 @@ def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: def exl2( model_path: str, - device: Optional[str] = None, - model_kwargs: dict = {}, + device: Optional[torch.device] = None, + max_seq_len: Optional[int] = None, + scale_pos_emb: Optional[float] = None, + scale_alpha_value: Optional[float] = None, + no_flash_attn: Optional[bool] = None, + num_experts_per_token: Optional[int] = None, + cache_8bit: bool = False, + cache_q4: bool = False, tokenizer_kwargs: dict = {}, -): + gpu_split: Optional[str] = None, + low_mem: Optional[bool] = None, + verbose: Optional[bool] = None, +) -> ExLlamaV2Model: + """ + Load an ExLlamaV2 model. + + Args: + model_path (str): Path to the model directory. + device (Optional[torch.device], optional): Device to load the model on. Defaults to None. + max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to None. + scale_pos_emb (Optional[float], optional): Scale factor for positional embeddings. Defaults to None. + scale_alpha_value (Optional[float], optional): Scale alpha value. Defaults to None. + no_flash_attn (Optional[bool], optional): Disable flash attention. Defaults to None. + num_experts_per_token (Optional[int], optional): Number of experts per token. Defaults to None. + cache_8bit (bool, optional): Use 8-bit cache. Defaults to False. + cache_q4 (bool, optional): Use Q4 cache. Defaults to False. + tokenizer_kwargs (dict, optional): Additional keyword arguments for the tokenizer. Defaults to {}. + gpu_split (str): \"auto\", or VRAM allocation per GPU in GB. Auto will use exllama's autosplit feature + low_mem (bool, optional): Enable VRAM optimizations, potentially trading off speed + verbose (bool, optional): Enable if you want debugging statements + + Returns: + ExLlamaV2Model: Loaded ExLlamaV2 model. + + Raises: + ImportError: If the `exllamav2` library is not installed. + """ + try: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config + from exllamav2 import ( # , ExLlamaV2Cache_Q4 + ExLlamaV2, + ExLlamaV2Cache, + ExLlamaV2Cache_8bit, + ExLlamaV2Config, + ) from transformers import AutoTokenizer except ImportError: raise ImportError( "The `exllamav2` library needs to be installed in order to use `exllamav2` models." 
) + if os.name != "nt": + use_fasttensors = True + else: + use_fasttensors = False + + # Create config + config = ExLlamaV2Config() config.model_dir = model_path + config.fasttensors = use_fasttensors config.prepare() - config.max_seq_len = model_kwargs.pop("max_seq_len", config.max_seq_len) - config.scale_pos_emb = model_kwargs.pop("scale_pos_emb", config.scale_pos_emb) - config.scale_alpha_value = model_kwargs.pop( - "scale_alpha_value", config.scale_alpha_value - ) - config.no_flash_attn = model_kwargs.pop("no_flash_attn", config.no_flash_attn) - config.num_experts_per_token = int( - model_kwargs.pop("num_experts_per_token", config.num_experts_per_token) - ) + # Set config options + + config.max_seq_len = max_seq_len + config.scale_pos_emb = scale_pos_emb + config.scale_alpha_value = scale_alpha_value + config.no_flash_attn = no_flash_attn + if num_experts_per_token: + config.num_experts_per_token = num_experts_per_token + if low_mem: + config.set_low_mem() + + # Load the model model = ExLlamaV2(config) split = None - if "gpu_split" in model_kwargs.keys(): - split = [float(alloc) for alloc in model_kwargs["gpu_split"].split(",")] + if gpu_split and gpu_split != "auto": + split = [float(alloc) for alloc in gpu_split.split(",")] - model.load(split) + if gpu_split != "auto": + if not verbose: + print(" -- Loading model...") + model.load(split) + + # Load tokenizer + + if not verbose: + print(" -- Loading tokenizer...") + + # tokenizer = ExLlamaV2Tokenizer(config) tokenizer_kwargs.setdefault("padding_side", "left") tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) + # tokenizer = TransformerTokenizer(model_path, **tokenizer_kwargs) + + # Create cache + + if cache_8bit: + cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded) + # elif cache_q4: + # cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded) + else: + cache = ExLlamaV2Cache(model, lazy=not model.loaded) + + # Load model now if auto split enabled - cache = ExLlamaV2Cache(model) + if not model.loaded: + print(" -- Loading model...") + model.load_autosplit(cache) return ExLlamaV2Model(model, tokenizer, device, cache) From a46d86a762f072311bc63d04c5d2cb11ebc80f3b Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:11:16 -0600 Subject: [PATCH 02/25] Added LoRA support --- outlines/models/exllamav2.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 22a52f8cb..d38e5e83d 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -6,7 +6,7 @@ from .transformers import TransformerTokenizer if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache + from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora from transformers import PreTrainedTokenizer @@ -19,12 +19,14 @@ def __init__( tokenizer: "PreTrainedTokenizer", device, cache: "ExLlamaV2Cache", + lora: Optional["ExLlamaV2Lora"] = None, ): self.device = device self.model = model self.tokenizer = TransformerTokenizer(tokenizer) self.cache = cache self.past_seq = None + self.lora = lora def forward(self, input_ids: torch.LongTensor, *_): """Compute a forward pass through the exl2 model.""" @@ -51,6 +53,7 @@ def forward(self, input_ids: torch.LongTensor, *_): seq_tensor[longest_prefix:-1].view(1, -1), self.cache, preprocess_only=True, + loras=self.lora, ) elif seq_tensor.shape[0] == longest_prefix: self.cache.current_seq_len -= 1 @@ -62,11 +65,14 @@ def forward(self, input_ids: torch.LongTensor, *_): 
seq_tensor[:-1].view(1, -1), self.cache, preprocess_only=True, + loras=self.lora, ) self.past_seq = seq_tensor - return self.model.forward(seq_tensor[-1:].view(1, -1), self.cache) + return self.model.forward( + seq_tensor[-1:].view(1, -1), self.cache, loras=self.lora + ) def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: logits = self.forward(input_ids) @@ -74,6 +80,11 @@ def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: return next_token_logits, None + def update_lora(self, lora_path): + """Update and apply the LoRA to the model. Input the LoRA path""" + self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path) + print(" -- Loading LoRA...") + def exl2( model_path: str, From e0544f0774456bfa0a989e5c88f2fe894a90541f Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:21:13 -0600 Subject: [PATCH 03/25] Added unloading as well --- outlines/models/exllamav2.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index d38e5e83d..3294f5b9f 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -80,10 +80,20 @@ def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: return next_token_logits, None - def update_lora(self, lora_path): - """Update and apply the LoRA to the model. Input the LoRA path""" - self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path) - print(" -- Loading LoRA...") + def update_lora(self, lora_path: Optional[str] = None): + """ + Update and apply the LoRA to the model. + + Args: + lora_path (Optional[str]): The path to the LoRA directory. If None, the LoRA will be unloaded. + """ + if lora_path is None: + if self.lora is not None: + print(" -- Unloading LoRA...") + self.lora = None + else: + self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path) + print(" -- Loading LoRA...") def exl2( From f533907f9a9f2d8c19d4fb167cca8f1c95ed1aac Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:53:12 -0600 Subject: [PATCH 04/25] fixed LoRA import --- outlines/models/exllamav2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 3294f5b9f..e43125c42 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -3,12 +3,12 @@ import torch -from .transformers import TransformerTokenizer - if TYPE_CHECKING: from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora from transformers import PreTrainedTokenizer +from .transformers import TransformerTokenizer + class ExLlamaV2Model: """Represents a `exl2` model.""" @@ -137,6 +137,7 @@ def exl2( """ try: + from exllamav2 import ExLlamaV2Lora # noqa: F401 from exllamav2 import ( # , ExLlamaV2Cache_Q4 ExLlamaV2, ExLlamaV2Cache, From 49165f3982b5ff7be2f1968516dec8ce045e8135 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:57:02 -0600 Subject: [PATCH 05/25] fixed LoRA import --- outlines/models/exllamav2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index e43125c42..6636ee699 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -4,9 +4,11 @@ import torch if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora + from exllamav2 import ExLlamaV2, ExLlamaV2Cache from transformers import PreTrainedTokenizer +from exllamav2 import ExLlamaV2Lora + 
from .transformers import TransformerTokenizer @@ -137,7 +139,6 @@ def exl2( """ try: - from exllamav2 import ExLlamaV2Lora # noqa: F401 from exllamav2 import ( # , ExLlamaV2Cache_Q4 ExLlamaV2, ExLlamaV2Cache, From 741251dd6317db3f05b0d7ff8648e4221e4e471b Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 21:06:32 -0600 Subject: [PATCH 06/25] fixed LoRA import --- outlines/models/exllamav2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 6636ee699..9bfa5d7e0 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -55,7 +55,7 @@ def forward(self, input_ids: torch.LongTensor, *_): seq_tensor[longest_prefix:-1].view(1, -1), self.cache, preprocess_only=True, - loras=self.lora, + loras=[self.lora], ) elif seq_tensor.shape[0] == longest_prefix: self.cache.current_seq_len -= 1 @@ -67,13 +67,13 @@ def forward(self, input_ids: torch.LongTensor, *_): seq_tensor[:-1].view(1, -1), self.cache, preprocess_only=True, - loras=self.lora, + loras=[self.lora], ) self.past_seq = seq_tensor return self.model.forward( - seq_tensor[-1:].view(1, -1), self.cache, loras=self.lora + seq_tensor[-1:].view(1, -1), self.cache, loras=[self.lora] ) def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: From d5232d840f2b496c2b8850fa820e3594b158aabf Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 23:11:33 -0600 Subject: [PATCH 07/25] Made max_seq_len optional again --- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 9bfa5d7e0..3757acdd8 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -164,8 +164,8 @@ def exl2( config.prepare() # Set config options - - config.max_seq_len = max_seq_len + if max_seq_len is not None: + config.max_seq_len = max_seq_len config.scale_pos_emb = scale_pos_emb config.scale_alpha_value = scale_alpha_value config.no_flash_attn = no_flash_attn From 5caa973ba36e6c520a0dc70f9d434d10bb07f0c4 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 00:17:31 -0600 Subject: [PATCH 08/25] Made remaining params optional --- outlines/models/exllamav2.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 3757acdd8..f4cef5aff 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -166,10 +166,13 @@ def exl2( # Set config options if max_seq_len is not None: config.max_seq_len = max_seq_len - config.scale_pos_emb = scale_pos_emb - config.scale_alpha_value = scale_alpha_value - config.no_flash_attn = no_flash_attn - if num_experts_per_token: + if scale_pos_emb is not None: + config.scale_pos_emb = scale_pos_emb + if scale_alpha_value is not None: + config.scale_alpha_value = scale_alpha_value + if no_flash_attn is not None: + config.no_flash_attn = no_flash_attn + if num_experts_per_token is not None: config.num_experts_per_token = num_experts_per_token if low_mem: config.set_low_mem() From 1be48586d7b6be7986bec234fef2877c1df949ab Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 00:26:12 -0600 Subject: [PATCH 09/25] Removed optional flag on device. Even before my changes it would crash without inputting a device. 
--- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index f4cef5aff..6aed00f0c 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -100,7 +100,7 @@ def update_lora(self, lora_path: Optional[str] = None): def exl2( model_path: str, - device: Optional[torch.device] = None, + device: torch.device = None, max_seq_len: Optional[int] = None, scale_pos_emb: Optional[float] = None, scale_alpha_value: Optional[float] = None, @@ -118,7 +118,7 @@ def exl2( Args: model_path (str): Path to the model directory. - device (Optional[torch.device], optional): Device to load the model on. Defaults to None. + device (torch.device): Device to load the model on. Defaults to None. max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to None. scale_pos_emb (Optional[float], optional): Scale factor for positional embeddings. Defaults to None. scale_alpha_value (Optional[float], optional): Scale alpha value. Defaults to None. From f75ff1fa37d46c0f8fe34034e1a0b0ce98b15b63 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 00:30:34 -0600 Subject: [PATCH 10/25] Fixed type check --- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 6aed00f0c..54241b3f3 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -100,7 +100,7 @@ def update_lora(self, lora_path: Optional[str] = None): def exl2( model_path: str, - device: torch.device = None, + device: str, max_seq_len: Optional[int] = None, scale_pos_emb: Optional[float] = None, scale_alpha_value: Optional[float] = None, @@ -118,7 +118,7 @@ def exl2( Args: model_path (str): Path to the model directory. - device (torch.device): Device to load the model on. Defaults to None. + device (str): Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to None. scale_pos_emb (Optional[float], optional): Scale factor for positional embeddings. Defaults to None. scale_alpha_value (Optional[float], optional): Scale alpha value. Defaults to None. From e33d344c95597bcee94d049c069cf888376bb7b5 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 12:53:37 -0600 Subject: [PATCH 11/25] Fixed the input error --- outlines/models/exllamav2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 54241b3f3..dd0a60513 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -4,11 +4,9 @@ import torch if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache + from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora from transformers import PreTrainedTokenizer -from exllamav2 import ExLlamaV2Lora - from .transformers import TransformerTokenizer @@ -89,6 +87,12 @@ def update_lora(self, lora_path: Optional[str] = None): Args: lora_path (Optional[str]): The path to the LoRA directory. If None, the LoRA will be unloaded. """ + try: + from exllamav2 import ExLlamaV2Lora + except ImportError: + raise ImportError( + "The `exllamav2` library needs to be installed in order to use `exllamav2` models." 
+            )
         if lora_path is None:
             if self.lora is not None:
                 print(" -- Unloading LoRA...")

From be528af227838cf8c7ea62244fe31fc31bf3c302 Mon Sep 17 00:00:00 2001
From: psych0v0yager
Date: Thu, 7 Mar 2024 20:12:00 -0600
Subject: [PATCH 12/25] 4 bit cache support is now active

---
 outlines/models/exllamav2.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py
index dd0a60513..0a97223a7 100644
--- a/outlines/models/exllamav2.py
+++ b/outlines/models/exllamav2.py
@@ -143,10 +143,11 @@ def exl2(
     """

     try:
-        from exllamav2 import (  # , ExLlamaV2Cache_Q4
+        from exllamav2 import (
             ExLlamaV2,
             ExLlamaV2Cache,
             ExLlamaV2Cache_8bit,
+            ExLlamaV2Cache_Q4,
             ExLlamaV2Config,
         )
         from transformers import AutoTokenizer
@@ -209,8 +210,8 @@ def exl2(

     if cache_8bit:
         cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded)
-    # elif cache_q4:
-    #     cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
+    elif cache_q4:
+        cache = ExLlamaV2Cache_Q4(model, lazy=not model.loaded)
     else:
         cache = ExLlamaV2Cache(model, lazy=not model.loaded)
From a8ab1e65fa10b89353f163c22ce2f907f96bd2ef Mon Sep 17 00:00:00 2001
From: psych0v0yager
Date: Mon, 11 Mar 2024 16:54:25 -0500
Subject: [PATCH 25/25] Made formatting changes

---
 outlines/models/exllamav2.py | 41 ++++++++++++++----------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py
index 0a97223a7..dc92287ba 100644
--- a/outlines/models/exllamav2.py
+++ b/outlines/models/exllamav2.py
@@ -156,13 +156,19 @@ def exl2(
             "The `exllamav2` library needs to be installed in order to use `exllamav2` models."
) + # Load tokenizer + if not verbose: + print(" -- Loading tokenizer...") + tokenizer_kwargs.setdefault("padding_side", "left") + tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) + + # Check fasttensors for config if os.name != "nt": use_fasttensors = True else: use_fasttensors = False # Create config - config = ExLlamaV2Config() config.model_dir = model_path config.fasttensors = use_fasttensors @@ -182,32 +188,10 @@ def exl2( if low_mem: config.set_low_mem() - # Load the model - + # Prepare the model from the config model = ExLlamaV2(config) - split = None - if gpu_split and gpu_split != "auto": - split = [float(alloc) for alloc in gpu_split.split(",")] - - if gpu_split != "auto": - if not verbose: - print(" -- Loading model...") - model.load(split) - - # Load tokenizer - - if not verbose: - print(" -- Loading tokenizer...") - - # tokenizer = ExLlamaV2Tokenizer(config) - - tokenizer_kwargs.setdefault("padding_side", "left") - tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) - # tokenizer = TransformerTokenizer(model_path, **tokenizer_kwargs) - # Create cache - if cache_8bit: cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded) elif cache_q4: @@ -215,8 +199,15 @@ def exl2( else: cache = ExLlamaV2Cache(model, lazy=not model.loaded) - # Load model now if auto split enabled + # Load the model + split = None + if gpu_split and gpu_split != "auto": + split = [float(alloc) for alloc in gpu_split.split(",")] + if not verbose: + print(" -- Loading model...") + model.load(split) + # Autoload if no GPU split was provided if not model.loaded: print(" -- Loading model...") model.load_autosplit(cache)
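
Taken together, the series reshapes the `exl2` loader's signature (explicit keyword arguments instead of `model_kwargs`), adds quantized KV-cache options, and lets a LoRA be loaded or unloaded at runtime. The sketch below shows how the resulting API might be called; it is not taken from the patch series itself, the model and LoRA paths are placeholders, and it assumes `exl2` and `ExLlamaV2Model.update_lora` behave exactly as defined in the patched `outlines/models/exllamav2.py`.

    # Usage sketch for the refactored loader (placeholder paths, hypothetical model).
    from outlines.models.exllamav2 import exl2

    model = exl2(
        model_path="/path/to/exl2-quantized-model",  # placeholder model directory
        device="cuda",        # plain string after the later patches: 'cuda' or 'cpu'
        max_seq_len=4096,     # optional; the model config's value is used when omitted
        cache_q4=True,        # Q4 KV cache from the "4 bit cache support" patch
        gpu_split="auto",     # let exllamav2 autosplit the weights across GPUs
    )

    # Hot-swap a LoRA at runtime; passing None unloads it again.
    model.update_lora("/path/to/lora-directory")  # placeholder LoRA directory
    model.update_lora(None)

With gpu_split="auto" the loader defers weight loading and calls model.load_autosplit(cache), so the cache is created lazily first; passing an explicit per-GPU split (e.g. "16,24") loads eagerly via model.load(split), matching the control flow in the final patch.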