From 0836a9317f82c15b9948cae06aea841b024109e4 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Sat, 23 Nov 2024 10:27:37 -0800
Subject: [PATCH 1/7] Grammar: Initial Formatron regex and JSON schema
 implementation

* Replace LMFE's regex and JSON schema filters with Formatron's
* Remove Outlines EBNF filter in preparation for Formatron KBNF filter
* TODO: Implement Formatron KBNF filter
---
 backends/exllamav2/grammar.py | 156 ++++++++--------------------------
 backends/exllamav2/model.py   |   2 +-
 pyproject.toml                |   3 +-
 3 files changed, 39 insertions(+), 122 deletions(-)

diff --git a/backends/exllamav2/grammar.py b/backends/exllamav2/grammar.py
index 3ad2f44a..9c274915 100644
--- a/backends/exllamav2/grammar.py
+++ b/backends/exllamav2/grammar.py
@@ -1,110 +1,20 @@
 import traceback
 from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
-from exllamav2.generator.filters import ExLlamaV2Filter, ExLlamaV2PrefixFilter
-from lmformatenforcer import (
-    JsonSchemaParser,
-    RegexParser,
-    TokenEnforcer,
-    CharacterLevelParser,
-)
-from lmformatenforcer.integrations.exllamav2 import (
-    build_token_enforcer_tokenizer_data,
-)
+from exllamav2.generator.filters import ExLlamaV2Filter
 from loguru import logger
 from typing import List
-from functools import lru_cache
-
-
-class OutlinesTokenizerWrapper:
-    """Wrapper for Outlines tokenizer"""
-
-    def __init__(self, tokenizer):
-        self.tokenizer = tokenizer
-        id_to_piece = self.tokenizer.get_id_to_piece_list()
-        self.vocabulary = {piece: idx for idx, piece in enumerate(id_to_piece)}
-        self.eos_token_id = self.tokenizer.eos_token_id
-        self.eos_token = id_to_piece[self.tokenizer.eos_token_id]
-        self.special_tokens = list(self.tokenizer.extended_id_to_piece.keys())
-
-    def convert_token_to_string(self, token):
-        return token
-
-    def decode(self, tokens):
-        s = ""
-        id_to_piece = self.tokenizer.get_id_to_piece_list()
-        for t in tokens:
-            s += id_to_piece[t]
-        return s
-
-
-class ExLlamaV2EbnfFilter(ExLlamaV2Filter):
-    """Filter class for context-free grammar via outlines"""
-
-    def __init__(self, model, tokenizer, grammar):
-        from outlines.fsm.fsm import CFGFSM
-
-        super().__init__(model, tokenizer)
-
-        self.wrapped_tokenizer = OutlinesTokenizerWrapper(tokenizer)
-        self.fsm = CFGFSM(grammar, self.wrapped_tokenizer)
-        self.state = self.fsm.first_state
-
-    def begin(self, prefix_str=""):
-        self.state = self.fsm.first_state
-
-    def feed(self, token):
-        self.state = self.fsm.next_state(self.state, token.item())
-
-    def next(self):
-        return self.fsm.allowed_token_ids(self.state), set()
-
-    def use_background_worker(self):
-        return True
-
-
-@lru_cache(10)
-def _get_lmfe_tokenizer_data(tokenizer: ExLlamaV2Tokenizer):
-    return build_token_enforcer_tokenizer_data(tokenizer)
-
-
-class ExLlamaV2TokenEnforcerFilter(ExLlamaV2Filter):
-    """Filter class for LMFE"""
-
-    token_sequence: List[int]
-
-    def __init__(
-        self,
-        model: ExLlamaV2,
-        tokenizer: ExLlamaV2Tokenizer,
-        character_level_parser: CharacterLevelParser,
-    ):
-        super().__init__(model, tokenizer)
-        tokenizer_data = _get_lmfe_tokenizer_data(tokenizer)
-        self.token_enforcer = TokenEnforcer(tokenizer_data, character_level_parser)
-        self.token_sequence = []
-
-    def begin(self, prefix_str: str):
-        self.token_sequence = []
-
-    def feed(self, token):
-        self.token_sequence.append(int(token[0][0]))
-
-    def next(self):
-        allowed_tokens = self.token_enforcer.get_allowed_tokens(self.token_sequence)
-        if not hasattr(self, "allow_return_type_list"):
-            return set(allowed_tokens), set()
-        else:
-            return sorted(allowed_tokens), []
-
-    def use_background_worker(self):
-        return True
+from formatron.formatter import FormatterBuilder
+from formatron.schemas import json_schema
+from formatron.integrations.exllamav2 import create_formatter_filter
 
 
 def clear_grammar_func_cache():
     """Flush tokenizer_data cache to avoid holding references to
     tokenizers after unloading a model"""
 
-    _get_lmfe_tokenizer_data.cache_clear()
+    # TODO: Unsure if this is needed with formatron
+    pass
 
 
 class ExLlamaV2Grammar:
@@ -117,7 +27,7 @@ def __init__(self):
 
     def add_json_schema_filter(
         self,
-        json_schema: dict,
+        schema: dict,
         model: ExLlamaV2,
         tokenizer: ExLlamaV2Tokenizer,
     ):
@@ -125,7 +35,16 @@ def add_json_schema_filter(
 
         # Create the parser
         try:
-            schema_parser = JsonSchemaParser(json_schema)
+            # Add fields required by formatron if not present
+            if "$id" not in schema:
+                schema["$id"] = "https://example.com/example.json"
+            if "$schema" not in schema:
+                schema["$schema"] = "http://json-schema.org/draft-07/schema#"
+
+            # Validate schema and create formatter
+            schema = json_schema.create_schema(schema)
+            f = FormatterBuilder()
+            f.append_line(f"{f.json(schema)}")
         except Exception:
             traceback.print_exc()
             logger.error(
@@ -135,14 +54,10 @@ def add_json_schema_filter(
 
             return
 
-        # Allow JSON objects or JSON arrays at the top level
-        json_prefixes = ["[", "{"]
-
-        lmfilter = ExLlamaV2TokenEnforcerFilter(model, tokenizer, schema_parser)
-        prefix_filter = ExLlamaV2PrefixFilter(model, tokenizer, json_prefixes)
+        lmfilter = create_formatter_filter(model, tokenizer, f)
 
         # Append the filters
-        self.filters.extend([lmfilter, prefix_filter])
+        self.filters.append(lmfilter)
 
     def add_regex_filter(
         self,
@@ -154,7 +69,9 @@ def add_regex_filter(
 
         # Create the parser
         try:
-            pattern_parser = RegexParser(pattern)
+            # Validate regex and create formatter
+            f = FormatterBuilder()
+            f.append_line(f"{f.regex(pattern)}")
         except Exception:
             traceback.print_exc()
             logger.error(
@@ -164,32 +81,33 @@ def add_regex_filter(
 
             return
 
-        lmfilter = ExLlamaV2TokenEnforcerFilter(model, tokenizer, pattern_parser)
+        lmfilter = create_formatter_filter(model, tokenizer, f)
 
         # Append the filters
         self.filters.append(lmfilter)
 
-    def add_ebnf_filter(
+    def add_kbnf_filter(
         self,
-        ebnf_string: str,
+        kbnf_string: str,
         model: ExLlamaV2,
         tokenizer: ExLlamaV2Tokenizer,
     ):
-        """
-        Add an EBNF grammar filter.
-        Possibly replace outlines with an in-house solution in the future.
-        """
+        """Adds an ExllamaV2 filter based on KBNF grammar."""
+
         # Create the parser
         try:
-            ebnf_filter = ExLlamaV2EbnfFilter(model, tokenizer, ebnf_string)
-        except ImportError:
+            # Validate KBNF and create formatter
+            f = FormatterBuilder()
+            # TODO: Implement this
+        except Exception:
             logger.error(
-                "Skipping EBNF parsing because Outlines is not installed.\n"
-                "Please run the following command in your environment "
-                "to install extra packages:\n"
-                "pip install -U .[extras]"
+                "Skipping because the KBNF string couldn't be parsed. "
+                "Please read the above error for more information."
             )
 
             return
 
-        self.filters.append(ebnf_filter)
+        lmfilter = create_formatter_filter(model, tokenizer, f)
+
+        # Append the filters
+        self.filters.append(lmfilter)
diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index ff11531a..50cef42c 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -1194,7 +1194,7 @@ async def generate_gen(
         # Add EBNF filter if it exists
         grammar_string = unwrap(kwargs.get("grammar_string"))
         if grammar_string:
-            grammar_handler.add_ebnf_filter(grammar_string, self.model, self.tokenizer)
+            grammar_handler.add_kbnf_filter(grammar_string, self.model, self.tokenizer)
 
         # Set banned strings
         banned_strings: List[str] = unwrap(kwargs.get("banned_strings"), [])
diff --git a/pyproject.toml b/pyproject.toml
index de782b73..efa1d760 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ dependencies = [
     "sse-starlette",
     "packaging",
     "tokenizers",
-    "lm-format-enforcer >= 0.9.6",
+    "formatron",
     "aiofiles",
     "aiohttp",
     "async_lru",
@@ -53,7 +53,6 @@ dependencies = [
 
 [project.optional-dependencies]
 extras = [
     # Heavy dependencies that aren't for everyday use
-    "outlines",
     "infinity-emb",
     "sentence-transformers",
 ]
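
Note on the patch above: both structured-output paths now go through Formatron's FormatterBuilder. The flow can be sketched standalone roughly as follows, using only the calls the patch itself makes; the toy schema is illustrative, and the final filter step assumes a loaded ExLlamaV2 model and tokenizer:

    from formatron.formatter import FormatterBuilder
    from formatron.schemas import json_schema

    # Formatron wants "$id" and "$schema" present; the patch injects
    # placeholder values when the caller omits them.
    schema = {
        "$id": "https://example.com/example.json",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    }

    validated = json_schema.create_schema(schema)  # raises if the schema is invalid
    f = FormatterBuilder()
    f.append_line(f"{f.json(validated)}")  # constrain output to one line of JSON

    # With a loaded model this becomes an ExLlamaV2 filter:
    # lmfilter = create_formatter_filter(model, tokenizer, f)

The LMFE version needed a separate ExLlamaV2PrefixFilter to force a leading "[" or "{"; the schema-driven Formatron grammar already constrains the opening token, which is presumably why the prefix filter is dropped.
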
" @@ -111,3 +117,19 @@ def add_kbnf_filter( # Append the filters self.filters.append(lmfilter) + + +class CustomExtractor(NonterminalExtractor): + def __init__(self, nonterminal: str, kbnf_string: str): + super().__init__(nonterminal) + self.kbnf_string = kbnf_string + + # Fails without an extract function defined + # No idea what it does or why it's needed, but this seems to work + # TODO: Figure out how to do this properly + def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]: + return input_str[len(input_str) :], input_str[: len(input_str)] + + @property + def kbnf_definition(self) -> str: + return self.kbnf_string.replace("start", self.nonterminal) From 8f209efb99c0aa06f23098e0a47ffc3216fc1a64 Mon Sep 17 00:00:00 2001 From: DocShotgun <126566557+DocShotgun@users.noreply.github.com> Date: Sun, 24 Nov 2024 10:44:45 -0800 Subject: [PATCH 3/7] Grammar: Clean up KBNF implementation * Also remove empty cache clear function --- backends/exllamav2/grammar.py | 30 +++++++++++------------------- backends/exllamav2/model.py | 3 --- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/backends/exllamav2/grammar.py b/backends/exllamav2/grammar.py index 9621e3f8..9a6b5204 100644 --- a/backends/exllamav2/grammar.py +++ b/backends/exllamav2/grammar.py @@ -1,22 +1,14 @@ import traceback import typing -from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer -from exllamav2.generator.filters import ExLlamaV2Filter -from loguru import logger from typing import List +from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer +from exllamav2.generator.filters import ExLlamaV2Filter +from formatron.extractor import NonterminalExtractor from formatron.formatter import FormatterBuilder -from formatron.schemas import json_schema from formatron.integrations.exllamav2 import create_formatter_filter -from formatron.extractor import NonterminalExtractor - - -def clear_grammar_func_cache(): - """Flush tokenizer_data cache to avoid holding references to - tokenizers after unloading a model""" - - # TODO: Unsure if this is needed with formatron - pass +from formatron.schemas import json_schema +from loguru import logger class ExLlamaV2Grammar: @@ -102,7 +94,7 @@ def add_kbnf_filter( f = FormatterBuilder() f.append_line( f"{f.extractor( - lambda nonterminal: CustomExtractor(nonterminal, kbnf_string) + lambda nonterminal: CFGExtractor(nonterminal, kbnf_string) )}" ) except Exception: @@ -119,16 +111,16 @@ def add_kbnf_filter( self.filters.append(lmfilter) -class CustomExtractor(NonterminalExtractor): +class CFGExtractor(NonterminalExtractor): + """Extractor class for KBNF context-free grammar""" + def __init__(self, nonterminal: str, kbnf_string: str): super().__init__(nonterminal) self.kbnf_string = kbnf_string - # Fails without an extract function defined - # No idea what it does or why it's needed, but this seems to work - # TODO: Figure out how to do this properly + # Return the entire input string as the extracted string def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]: - return input_str[len(input_str) :], input_str[: len(input_str)] + return "", input_str @property def kbnf_definition(self) -> str: diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 50cef42c..64ed5b90 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -833,9 +833,6 @@ async def unload(self, loras_only: bool = False, **kwargs): # Wait for other jobs to finish await self.wait_for_jobs(kwargs.get("skip_wait")) - # Delete references held in the 
From 8f209efb99c0aa06f23098e0a47ffc3216fc1a64 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Sun, 24 Nov 2024 10:44:45 -0800
Subject: [PATCH 3/7] Grammar: Clean up KBNF implementation

* Also remove empty cache clear function
---
 backends/exllamav2/grammar.py | 30 +++++++++++-------------------
 backends/exllamav2/model.py   |  3 ---
 2 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/backends/exllamav2/grammar.py b/backends/exllamav2/grammar.py
index 9621e3f8..9a6b5204 100644
--- a/backends/exllamav2/grammar.py
+++ b/backends/exllamav2/grammar.py
@@ -1,22 +1,14 @@
 import traceback
 import typing
-from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
-from exllamav2.generator.filters import ExLlamaV2Filter
-from loguru import logger
 from typing import List
 
+from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
+from exllamav2.generator.filters import ExLlamaV2Filter
+from formatron.extractor import NonterminalExtractor
 from formatron.formatter import FormatterBuilder
-from formatron.schemas import json_schema
 from formatron.integrations.exllamav2 import create_formatter_filter
-from formatron.extractor import NonterminalExtractor
-
-
-def clear_grammar_func_cache():
-    """Flush tokenizer_data cache to avoid holding references to
-    tokenizers after unloading a model"""
-
-    # TODO: Unsure if this is needed with formatron
-    pass
+from formatron.schemas import json_schema
+from loguru import logger
 
 
 class ExLlamaV2Grammar:
@@ -102,7 +94,7 @@ def add_kbnf_filter(
             f = FormatterBuilder()
             f.append_line(
                 f"{f.extractor(
-                    lambda nonterminal: CustomExtractor(nonterminal, kbnf_string)
+                    lambda nonterminal: CFGExtractor(nonterminal, kbnf_string)
                 )}"
             )
         except Exception:
@@ -119,16 +111,16 @@ def add_kbnf_filter(
         self.filters.append(lmfilter)
 
 
-class CustomExtractor(NonterminalExtractor):
+class CFGExtractor(NonterminalExtractor):
+    """Extractor class for KBNF context-free grammar"""
+
     def __init__(self, nonterminal: str, kbnf_string: str):
         super().__init__(nonterminal)
         self.kbnf_string = kbnf_string
 
-    # Fails without an extract function defined
-    # No idea what it does or why it's needed, but this seems to work
-    # TODO: Figure out how to do this properly
+    # Return the entire input string as the extracted string
     def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]:
-        return input_str[len(input_str) :], input_str[: len(input_str)]
+        return "", input_str
 
     @property
     def kbnf_definition(self) -> str:
diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 50cef42c..64ed5b90 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -833,9 +833,6 @@ async def unload(self, loras_only: bool = False, **kwargs):
         # Wait for other jobs to finish
         await self.wait_for_jobs(kwargs.get("skip_wait"))
 
-        # Delete references held in the grammar module
-        clear_grammar_func_cache()
-
         # Clear the image embedding cache
         clear_image_embedding_cache()

From 6f2dc2ea99bcb9b430d27ec08047353fc0c5f3cd Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Sun, 24 Nov 2024 11:35:45 -0800
Subject: [PATCH 4/7] Grammar: Fix syntax, lint

---
 backends/exllamav2/grammar.py | 5 ++---
 backends/exllamav2/model.py   | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/backends/exllamav2/grammar.py b/backends/exllamav2/grammar.py
index 9a6b5204..62f50a31 100644
--- a/backends/exllamav2/grammar.py
+++ b/backends/exllamav2/grammar.py
@@ -93,9 +93,8 @@ def add_kbnf_filter(
             # Validate KBNF and create formatter
             f = FormatterBuilder()
             f.append_line(
-                f"{f.extractor(
-                    lambda nonterminal: CFGExtractor(nonterminal, kbnf_string)
-                )}"
+                f"""{f.extractor(lambda nonterminal:
+                    CFGExtractor(nonterminal, kbnf_string))}"""
             )
         except Exception:
             logger.error(
diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 64ed5b90..7a5324f3 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -39,7 +39,6 @@
 
 from backends.exllamav2.grammar import (
     ExLlamaV2Grammar,
-    clear_grammar_func_cache,
 )
 from backends.exllamav2.utils import (
     exllama_disabled_flash_attn,
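
The syntax fix in patch 4 is a Python-version issue rather than a logic change: before Python 3.12 (PEP 701), a replacement field in a single- or double-quoted f-string cannot contain a newline, so the multi-line lambda introduced in patch 2 is a SyntaxError on 3.10 and 3.11. Triple-quoted f-strings may span lines anywhere, including inside the braces, which is what the rewrite relies on. A minimal illustration:

    # SyntaxError before Python 3.12: the replacement field spans lines
    # text = f"{(
    #     1 + 2
    # )}"

    # Accepted on all supported versions: triple quotes permit the newlines
    text = f"""{(
        1 + 2
    )}"""
    assert text == "3"
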
From 3c4211c963494618efc5253049444f588ddd118a Mon Sep 17 00:00:00 2001
From: kingbri <8082010+bdashore3@users.noreply.github.com>
Date: Mon, 2 Dec 2024 15:10:20 -0500
Subject: [PATCH 5/7] Dependencies: Ensure updated kbnf

Signed-off-by: kingbri <8082010+bdashore3@users.noreply.github.com>
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index efa1d760..b27f4532 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "packaging",
     "tokenizers",
     "formatron",
+    "kbnf>=0.4.1",
     "aiofiles",
     "aiohttp",
     "async_lru",

From 7f899734c012ed51b776be767f4f89736109afdf Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Thu, 5 Dec 2024 21:36:37 -0800
Subject: [PATCH 6/7] Grammar: Cache the engine vocabulary

* Avoid rebuilding the KBNF engine vocabulary on every grammar-enabled request
---
 backends/exllamav2/grammar.py | 40 +++++++++++++++++++++++++++++++----
 backends/exllamav2/model.py   |  4 ++++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/backends/exllamav2/grammar.py b/backends/exllamav2/grammar.py
index 62f50a31..47c5ed58 100644
--- a/backends/exllamav2/grammar.py
+++ b/backends/exllamav2/grammar.py
@@ -1,12 +1,14 @@
 import traceback
 import typing
+from functools import lru_cache
 from typing import List
 
+import torch
 from exllamav2 import ExLlamaV2, ExLlamaV2Tokenizer
 from exllamav2.generator.filters import ExLlamaV2Filter
 from formatron.extractor import NonterminalExtractor
 from formatron.formatter import FormatterBuilder
-from formatron.integrations.exllamav2 import create_formatter_filter
+from formatron.integrations.exllamav2 import FormatterFilter, create_engine_vocabulary
 from formatron.schemas import json_schema
 from loguru import logger
@@ -48,7 +50,7 @@ def add_json_schema_filter(
 
             return
 
-        lmfilter = create_formatter_filter(model, tokenizer, f)
+        lmfilter = _create_formatter_filter(model, tokenizer, f)
 
         # Append the filters
         self.filters.append(lmfilter)
@@ -75,7 +77,7 @@ def add_regex_filter(
 
             return
 
-        lmfilter = create_formatter_filter(model, tokenizer, f)
+        lmfilter = _create_formatter_filter(model, tokenizer, f)
 
         # Append the filters
         self.filters.append(lmfilter)
@@ -104,7 +106,7 @@ def add_kbnf_filter(
 
             return
 
-        lmfilter = create_formatter_filter(model, tokenizer, f)
+        lmfilter = _create_formatter_filter(model, tokenizer, f)
 
         # Append the filters
         self.filters.append(lmfilter)
@@ -124,3 +126,33 @@ def extract(self, input_str: str) -> typing.Optional[tuple[str, typing.Any]]:
     @property
     def kbnf_definition(self) -> str:
         return self.kbnf_string.replace("start", self.nonterminal)
+
+
+@lru_cache(1)
+def _create_cached_engine_vocabulary(tokenizer: ExLlamaV2Tokenizer):
+    """Build and cache engine vocabulary on first grammar run"""
+
+    return create_engine_vocabulary(tokenizer)
+
+
+def _create_formatter_filter(
+    model: ExLlamaV2, tokenizer: ExLlamaV2Tokenizer, formatter_builder: FormatterBuilder
+) -> ExLlamaV2Filter:
+    """
+    Create a formatter filter for the ExLlamaV2 engine.
+    Minimalist clone of formatron.integrations.exllamav2.create_formatter_filter
+    with lru_cache enabled for engine vocabulary
+    """
+
+    vocab = _create_cached_engine_vocabulary(tokenizer)
+    f = formatter_builder.build(
+        vocab, lambda tokens: tokenizer.decode(torch.tensor(tokens))
+    )
+    return FormatterFilter(model, tokenizer, f)
+
+
+def clear_grammar_func_cache():
+    """Flush tokenizer_data cache to avoid holding references to
+    tokenizers after unloading a model"""
+
+    _create_cached_engine_vocabulary.cache_clear()
diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 7a5324f3..50cef42c 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -39,6 +39,7 @@
 
 from backends.exllamav2.grammar import (
     ExLlamaV2Grammar,
+    clear_grammar_func_cache,
 )
 from backends.exllamav2.utils import (
     exllama_disabled_flash_attn,
@@ -832,6 +833,9 @@ async def unload(self, loras_only: bool = False, **kwargs):
         # Wait for other jobs to finish
         await self.wait_for_jobs(kwargs.get("skip_wait"))
 
+        # Delete references held in the grammar module
+        clear_grammar_func_cache()
+
         # Clear the image embedding cache
         clear_image_embedding_cache()
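
Patch 6 trades a little memory for per-request latency: building the KBNF engine vocabulary involves walking the tokenizer's entire vocabulary, so it is now computed once per loaded model and reused, with lru_cache(1) keying the single cached entry on the tokenizer object. Because lru_cache keeps a strong reference to its key, the unload path must call clear_grammar_func_cache(), or the tokenizer could never be garbage-collected. The same pattern in isolation (get_vocabulary and the toy tuple tokenizer are illustrative stand-ins, not the real API):

    from functools import lru_cache

    @lru_cache(1)  # one entry, keyed on the tokenizer object
    def get_vocabulary(tokenizer):
        print("building vocabulary...")  # stands in for create_engine_vocabulary
        return {piece: idx for idx, piece in enumerate(tokenizer)}

    toy_tokenizer = ("<s>", "</s>", "hello")
    get_vocabulary(toy_tokenizer)  # prints: building vocabulary...
    get_vocabulary(toy_tokenizer)  # cache hit; the vocabulary is not rebuilt

    get_vocabulary.cache_clear()   # on model unload: release the reference
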
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+cu121.torch2.5.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.5.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.5.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.5.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.5.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.5.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+cu121.torch2.5.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Windows FA2 from https://github.com/bdashore3/flash-attention/releases "flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu124torch2.5.1cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", @@ -99,9 +99,9 @@ amd = [ "torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'", # Exl2 - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'", - "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.5/exllamav2-0.2.5+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'", + "exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.6/exllamav2-0.2.6+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'", ] # MARK: Ruff options