From 14d01609bddd80082e56168fc90a32e098c5fc2b Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Tue, 5 Mar 2024 18:48:35 -0600 Subject: [PATCH 01/25] Refactored exl2 method to add in more features supported by the exllamav2 library --- outlines/models/exllamav2.py | 109 +++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 17 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index b06e5e60a..22a52f8cb 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,3 +1,4 @@ +import os from typing import TYPE_CHECKING, Optional import torch @@ -76,43 +77,117 @@ def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: def exl2( model_path: str, - device: Optional[str] = None, - model_kwargs: dict = {}, + device: Optional[torch.device] = None, + max_seq_len: Optional[int] = None, + scale_pos_emb: Optional[float] = None, + scale_alpha_value: Optional[float] = None, + no_flash_attn: Optional[bool] = None, + num_experts_per_token: Optional[int] = None, + cache_8bit: bool = False, + cache_q4: bool = False, tokenizer_kwargs: dict = {}, -): + gpu_split: Optional[str] = None, + low_mem: Optional[bool] = None, + verbose: Optional[bool] = None, +) -> ExLlamaV2Model: + """ + Load an ExLlamaV2 model. + + Args: + model_path (str): Path to the model directory. + device (Optional[torch.device], optional): Device to load the model on. Defaults to None. + max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to None. + scale_pos_emb (Optional[float], optional): Scale factor for positional embeddings. Defaults to None. + scale_alpha_value (Optional[float], optional): Scale alpha value. Defaults to None. + no_flash_attn (Optional[bool], optional): Disable flash attention. Defaults to None. + num_experts_per_token (Optional[int], optional): Number of experts per token. Defaults to None. + cache_8bit (bool, optional): Use 8-bit cache. Defaults to False. + cache_q4 (bool, optional): Use Q4 cache. Defaults to False. + tokenizer_kwargs (dict, optional): Additional keyword arguments for the tokenizer. Defaults to {}. + gpu_split (str): \"auto\", or VRAM allocation per GPU in GB. Auto will use exllama's autosplit feature + low_mem (bool, optional): Enable VRAM optimizations, potentially trading off speed + verbose (bool, optional): Enable if you want debugging statements + + Returns: + ExLlamaV2Model: Loaded ExLlamaV2 model. + + Raises: + ImportError: If the `exllamav2` library is not installed. + """ + try: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config + from exllamav2 import ( # , ExLlamaV2Cache_Q4 + ExLlamaV2, + ExLlamaV2Cache, + ExLlamaV2Cache_8bit, + ExLlamaV2Config, + ) from transformers import AutoTokenizer except ImportError: raise ImportError( "The `exllamav2` library needs to be installed in order to use `exllamav2` models." 
) + if os.name != "nt": + use_fasttensors = True + else: + use_fasttensors = False + + # Create config + config = ExLlamaV2Config() config.model_dir = model_path + config.fasttensors = use_fasttensors config.prepare() - config.max_seq_len = model_kwargs.pop("max_seq_len", config.max_seq_len) - config.scale_pos_emb = model_kwargs.pop("scale_pos_emb", config.scale_pos_emb) - config.scale_alpha_value = model_kwargs.pop( - "scale_alpha_value", config.scale_alpha_value - ) - config.no_flash_attn = model_kwargs.pop("no_flash_attn", config.no_flash_attn) - config.num_experts_per_token = int( - model_kwargs.pop("num_experts_per_token", config.num_experts_per_token) - ) + # Set config options + + config.max_seq_len = max_seq_len + config.scale_pos_emb = scale_pos_emb + config.scale_alpha_value = scale_alpha_value + config.no_flash_attn = no_flash_attn + if num_experts_per_token: + config.num_experts_per_token = num_experts_per_token + if low_mem: + config.set_low_mem() + + # Load the model model = ExLlamaV2(config) split = None - if "gpu_split" in model_kwargs.keys(): - split = [float(alloc) for alloc in model_kwargs["gpu_split"].split(",")] + if gpu_split and gpu_split != "auto": + split = [float(alloc) for alloc in gpu_split.split(",")] - model.load(split) + if gpu_split != "auto": + if not verbose: + print(" -- Loading model...") + model.load(split) + + # Load tokenizer + + if not verbose: + print(" -- Loading tokenizer...") + + # tokenizer = ExLlamaV2Tokenizer(config) tokenizer_kwargs.setdefault("padding_side", "left") tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) + # tokenizer = TransformerTokenizer(model_path, **tokenizer_kwargs) + + # Create cache + + if cache_8bit: + cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded) + # elif cache_q4: + # cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded) + else: + cache = ExLlamaV2Cache(model, lazy=not model.loaded) + + # Load model now if auto split enabled - cache = ExLlamaV2Cache(model) + if not model.loaded: + print(" -- Loading model...") + model.load_autosplit(cache) return ExLlamaV2Model(model, tokenizer, device, cache) From a46d86a762f072311bc63d04c5d2cb11ebc80f3b Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:11:16 -0600 Subject: [PATCH 02/25] Added LoRA support --- outlines/models/exllamav2.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 22a52f8cb..d38e5e83d 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -6,7 +6,7 @@ from .transformers import TransformerTokenizer if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache + from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora from transformers import PreTrainedTokenizer @@ -19,12 +19,14 @@ def __init__( tokenizer: "PreTrainedTokenizer", device, cache: "ExLlamaV2Cache", + lora: Optional["ExLlamaV2Lora"] = None, ): self.device = device self.model = model self.tokenizer = TransformerTokenizer(tokenizer) self.cache = cache self.past_seq = None + self.lora = lora def forward(self, input_ids: torch.LongTensor, *_): """Compute a forward pass through the exl2 model.""" @@ -51,6 +53,7 @@ def forward(self, input_ids: torch.LongTensor, *_): seq_tensor[longest_prefix:-1].view(1, -1), self.cache, preprocess_only=True, + loras=self.lora, ) elif seq_tensor.shape[0] == longest_prefix: self.cache.current_seq_len -= 1 @@ -62,11 +65,14 @@ def forward(self, input_ids: torch.LongTensor, *_): 
seq_tensor[:-1].view(1, -1), self.cache, preprocess_only=True, + loras=self.lora, ) self.past_seq = seq_tensor - return self.model.forward(seq_tensor[-1:].view(1, -1), self.cache) + return self.model.forward( + seq_tensor[-1:].view(1, -1), self.cache, loras=self.lora + ) def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: logits = self.forward(input_ids) @@ -74,6 +80,11 @@ def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: return next_token_logits, None + def update_lora(self, lora_path): + """Update and apply the LoRA to the model. Input the LoRA path""" + self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path) + print(" -- Loading LoRA...") + def exl2( model_path: str, From e0544f0774456bfa0a989e5c88f2fe894a90541f Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:21:13 -0600 Subject: [PATCH 03/25] Added unloading as well --- outlines/models/exllamav2.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index d38e5e83d..3294f5b9f 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -80,10 +80,20 @@ def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: return next_token_logits, None - def update_lora(self, lora_path): - """Update and apply the LoRA to the model. Input the LoRA path""" - self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path) - print(" -- Loading LoRA...") + def update_lora(self, lora_path: Optional[str] = None): + """ + Update and apply the LoRA to the model. + + Args: + lora_path (Optional[str]): The path to the LoRA directory. If None, the LoRA will be unloaded. + """ + if lora_path is None: + if self.lora is not None: + print(" -- Unloading LoRA...") + self.lora = None + else: + self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path) + print(" -- Loading LoRA...") def exl2( From f533907f9a9f2d8c19d4fb167cca8f1c95ed1aac Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:53:12 -0600 Subject: [PATCH 04/25] fixed LoRA import --- outlines/models/exllamav2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 3294f5b9f..e43125c42 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -3,12 +3,12 @@ import torch -from .transformers import TransformerTokenizer - if TYPE_CHECKING: from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora from transformers import PreTrainedTokenizer +from .transformers import TransformerTokenizer + class ExLlamaV2Model: """Represents a `exl2` model.""" @@ -137,6 +137,7 @@ def exl2( """ try: + from exllamav2 import ExLlamaV2Lora # noqa: F401 from exllamav2 import ( # , ExLlamaV2Cache_Q4 ExLlamaV2, ExLlamaV2Cache, From 49165f3982b5ff7be2f1968516dec8ce045e8135 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 20:57:02 -0600 Subject: [PATCH 05/25] fixed LoRA import --- outlines/models/exllamav2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index e43125c42..6636ee699 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -4,9 +4,11 @@ import torch if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora + from exllamav2 import ExLlamaV2, ExLlamaV2Cache from transformers import PreTrainedTokenizer +from exllamav2 import ExLlamaV2Lora + 
from .transformers import TransformerTokenizer @@ -137,7 +139,6 @@ def exl2( """ try: - from exllamav2 import ExLlamaV2Lora # noqa: F401 from exllamav2 import ( # , ExLlamaV2Cache_Q4 ExLlamaV2, ExLlamaV2Cache, From 741251dd6317db3f05b0d7ff8648e4221e4e471b Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 21:06:32 -0600 Subject: [PATCH 06/25] fixed LoRA import --- outlines/models/exllamav2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 6636ee699..9bfa5d7e0 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -55,7 +55,7 @@ def forward(self, input_ids: torch.LongTensor, *_): seq_tensor[longest_prefix:-1].view(1, -1), self.cache, preprocess_only=True, - loras=self.lora, + loras=[self.lora], ) elif seq_tensor.shape[0] == longest_prefix: self.cache.current_seq_len -= 1 @@ -67,13 +67,13 @@ def forward(self, input_ids: torch.LongTensor, *_): seq_tensor[:-1].view(1, -1), self.cache, preprocess_only=True, - loras=self.lora, + loras=[self.lora], ) self.past_seq = seq_tensor return self.model.forward( - seq_tensor[-1:].view(1, -1), self.cache, loras=self.lora + seq_tensor[-1:].view(1, -1), self.cache, loras=[self.lora] ) def __call__(self, input_ids: torch.LongTensor, *_) -> torch.FloatTensor: From d5232d840f2b496c2b8850fa820e3594b158aabf Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Wed, 6 Mar 2024 23:11:33 -0600 Subject: [PATCH 07/25] Made max_seq_len optional again --- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 9bfa5d7e0..3757acdd8 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -164,8 +164,8 @@ def exl2( config.prepare() # Set config options - - config.max_seq_len = max_seq_len + if max_seq_len is not None: + config.max_seq_len = max_seq_len config.scale_pos_emb = scale_pos_emb config.scale_alpha_value = scale_alpha_value config.no_flash_attn = no_flash_attn From 5caa973ba36e6c520a0dc70f9d434d10bb07f0c4 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 00:17:31 -0600 Subject: [PATCH 08/25] Made remaining params optional --- outlines/models/exllamav2.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 3757acdd8..f4cef5aff 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -166,10 +166,13 @@ def exl2( # Set config options if max_seq_len is not None: config.max_seq_len = max_seq_len - config.scale_pos_emb = scale_pos_emb - config.scale_alpha_value = scale_alpha_value - config.no_flash_attn = no_flash_attn - if num_experts_per_token: + if scale_pos_emb is not None: + config.scale_pos_emb = scale_pos_emb + if scale_alpha_value is not None: + config.scale_alpha_value = scale_alpha_value + if no_flash_attn is not None: + config.no_flash_attn = no_flash_attn + if num_experts_per_token is not None: config.num_experts_per_token = num_experts_per_token if low_mem: config.set_low_mem() From 1be48586d7b6be7986bec234fef2877c1df949ab Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 00:26:12 -0600 Subject: [PATCH 09/25] Removed optional flag on device. Even before my changes it would crash without inputting a device. 
--- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index f4cef5aff..6aed00f0c 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -100,7 +100,7 @@ def update_lora(self, lora_path: Optional[str] = None): def exl2( model_path: str, - device: Optional[torch.device] = None, + device: torch.device = None, max_seq_len: Optional[int] = None, scale_pos_emb: Optional[float] = None, scale_alpha_value: Optional[float] = None, @@ -118,7 +118,7 @@ def exl2( Args: model_path (str): Path to the model directory. - device (Optional[torch.device], optional): Device to load the model on. Defaults to None. + device (torch.device): Device to load the model on. Defaults to None. max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to None. scale_pos_emb (Optional[float], optional): Scale factor for positional embeddings. Defaults to None. scale_alpha_value (Optional[float], optional): Scale alpha value. Defaults to None. From f75ff1fa37d46c0f8fe34034e1a0b0ce98b15b63 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 00:30:34 -0600 Subject: [PATCH 10/25] Fixed type check --- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 6aed00f0c..54241b3f3 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -100,7 +100,7 @@ def update_lora(self, lora_path: Optional[str] = None): def exl2( model_path: str, - device: torch.device = None, + device: str, max_seq_len: Optional[int] = None, scale_pos_emb: Optional[float] = None, scale_alpha_value: Optional[float] = None, @@ -118,7 +118,7 @@ def exl2( Args: model_path (str): Path to the model directory. - device (torch.device): Device to load the model on. Defaults to None. + device (str): Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to None. scale_pos_emb (Optional[float], optional): Scale factor for positional embeddings. Defaults to None. scale_alpha_value (Optional[float], optional): Scale alpha value. Defaults to None. From e33d344c95597bcee94d049c069cf888376bb7b5 Mon Sep 17 00:00:00 2001 From: psych0v0yager Date: Thu, 7 Mar 2024 12:53:37 -0600 Subject: [PATCH 11/25] Fixed the input error --- outlines/models/exllamav2.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 54241b3f3..dd0a60513 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -4,11 +4,9 @@ import torch if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache + from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora from transformers import PreTrainedTokenizer -from exllamav2 import ExLlamaV2Lora - from .transformers import TransformerTokenizer @@ -89,6 +87,12 @@ def update_lora(self, lora_path: Optional[str] = None): Args: lora_path (Optional[str]): The path to the LoRA directory. If None, the LoRA will be unloaded. """ + try: + from exllamav2 import ExLlamaV2Lora + except ImportError: + raise ImportError( + "The `exllamav2` library needs to be installed in order to use `exllamav2` models." 
+            )
         if lora_path is None:
             if self.lora is not None:
                 print(" -- Unloading LoRA...")

From be528af227838cf8c7ea62244fe31fc31bf3c302 Mon Sep 17 00:00:00 2001
From: psych0v0yager
Date: Thu, 7 Mar 2024 20:12:00 -0600
Subject: [PATCH 12/25] 4 bit cache support is now active

---
 outlines/models/exllamav2.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py
index dd0a60513..0a97223a7 100644
--- a/outlines/models/exllamav2.py
+++ b/outlines/models/exllamav2.py
@@ -143,10 +143,11 @@ def exl2(
     """

     try:
-        from exllamav2 import (  # , ExLlamaV2Cache_Q4
+        from exllamav2 import (
             ExLlamaV2,
             ExLlamaV2Cache,
             ExLlamaV2Cache_8bit,
+            ExLlamaV2Cache_Q4,
             ExLlamaV2Config,
         )
         from transformers import AutoTokenizer
@@ -209,8 +210,8 @@ def exl2(

     if cache_8bit:
         cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded)
-    # elif cache_q4:
-    #     cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
+    elif cache_q4:
+        cache = ExLlamaV2Cache_Q4(model, lazy=not model.loaded)
     else:
         cache = ExLlamaV2Cache(model, lazy=not model.loaded)
From a8ab1e65fa10b89353f163c22ce2f907f96bd2ef Mon Sep 17 00:00:00 2001
From: psych0v0yager
Date: Mon, 11 Mar 2024 16:54:25 -0500
Subject: [PATCH 25/25] Made formatting changes

---
 outlines/models/exllamav2.py | 41 ++++++++++++++----------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py
index 0a97223a7..dc92287ba 100644
--- a/outlines/models/exllamav2.py
+++ b/outlines/models/exllamav2.py
@@ -156,13 +156,19 @@ def exl2(
             "The `exllamav2` library needs to be installed in order to use `exllamav2` models."
) + # Load tokenizer + if not verbose: + print(" -- Loading tokenizer...") + tokenizer_kwargs.setdefault("padding_side", "left") + tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) + + # Check fasttensors for config if os.name != "nt": use_fasttensors = True else: use_fasttensors = False # Create config - config = ExLlamaV2Config() config.model_dir = model_path config.fasttensors = use_fasttensors @@ -182,32 +188,10 @@ def exl2( if low_mem: config.set_low_mem() - # Load the model - + # Prepare the model from the config model = ExLlamaV2(config) - split = None - if gpu_split and gpu_split != "auto": - split = [float(alloc) for alloc in gpu_split.split(",")] - - if gpu_split != "auto": - if not verbose: - print(" -- Loading model...") - model.load(split) - - # Load tokenizer - - if not verbose: - print(" -- Loading tokenizer...") - - # tokenizer = ExLlamaV2Tokenizer(config) - - tokenizer_kwargs.setdefault("padding_side", "left") - tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) - # tokenizer = TransformerTokenizer(model_path, **tokenizer_kwargs) - # Create cache - if cache_8bit: cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded) elif cache_q4: @@ -215,8 +199,15 @@ def exl2( else: cache = ExLlamaV2Cache(model, lazy=not model.loaded) - # Load model now if auto split enabled + # Load the model + split = None + if gpu_split and gpu_split != "auto": + split = [float(alloc) for alloc in gpu_split.split(",")] + if not verbose: + print(" -- Loading model...") + model.load(split) + # Autoload if no GPU split was provided if not model.loaded: print(" -- Loading model...") model.load_autosplit(cache)
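
Taken together, the series reshapes the `exl2` loader's signature (explicit keyword arguments instead of `model_kwargs`), adds quantized KV-cache options, and lets a LoRA be loaded or unloaded at runtime. The sketch below shows how the resulting API might be called; it is not taken from the patch series itself, the model and LoRA paths are placeholders, and it assumes `exl2` and `ExLlamaV2Model.update_lora` behave exactly as defined in the patched `outlines/models/exllamav2.py`.

    # Usage sketch for the refactored loader (placeholder paths, hypothetical model).
    from outlines.models.exllamav2 import exl2

    model = exl2(
        model_path="/path/to/exl2-quantized-model",  # placeholder model directory
        device="cuda",        # plain string after the later patches: 'cuda' or 'cpu'
        max_seq_len=4096,     # optional; the model config's value is used when omitted
        cache_q4=True,        # Q4 KV cache from the "4 bit cache support" patch
        gpu_split="auto",     # let exllamav2 autosplit the weights across GPUs
    )

    # Hot-swap a LoRA at runtime; passing None unloads it again.
    model.update_lora("/path/to/lora-directory")  # placeholder LoRA directory
    model.update_lora(None)

With gpu_split="auto" the loader defers weight loading and calls model.load_autosplit(cache), so the cache is created lazily first; passing an explicit per-GPU split (e.g. "16,24") loads eagerly via model.load(split), matching the control flow in the final patch.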