From 37cc701137165b2f3e8d4302ec6cb29c55d2b6a9 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Fri, 15 Nov 2024 20:35:18 -0800
Subject: [PATCH] Model: Enforce chunk_size as multiple of 256

---
 backends/exllamav2/model.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index c7d2069..dc68b22 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -321,9 +321,20 @@ async def create(cls, model_directory: pathlib.Path, quiet=False, **kwargs):
         if num_experts_override:
             self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
 
-        # Make sure chunk size is >= 16 and <= max seq length
+        # Make sure chunk size is >= 256, keep near or below max seq len
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
-        chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
+        chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
+        chunk_remainder = chunk_size % 256
+        if chunk_remainder != 0:
+            rounded_chunk_size = int(256 * ((chunk_size - chunk_remainder) / 256 + 1))
+
+            logger.warning(
+                f"The given chunk size ({chunk_size}) is "
+                "not a multiple of 256.\n"
+                "Overriding chunk_size with an overestimated value of "
+                f"{rounded_chunk_size} tokens."
+            )
+            chunk_size = rounded_chunk_size
 
         self.config.max_input_len = chunk_size
         self.config.max_attention_size = chunk_size**2
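
A minimal standalone sketch of the rounding arithmetic in this patch (the
function name and test values here are illustrative, not part of the commit):
for a nonzero remainder mod 256, the patch's expression
int(256 * ((chunk_size - chunk_remainder) / 256 + 1)) simplifies to
chunk_size - chunk_remainder + 256, i.e. round up to the next multiple of 256.

    def round_up_to_256(chunk_size: int) -> int:
        # Same arithmetic as the patch, written in simplified form.
        remainder = chunk_size % 256
        if remainder == 0:
            return chunk_size
        return chunk_size - remainder + 256

    assert round_up_to_256(2048) == 2048  # already a multiple of 256, unchanged
    assert round_up_to_256(2000) == 2048  # rounded up: the "overestimated value"
    assert round_up_to_256(4097) == 4352  # 4096 is 16 * 256, so next multiple is 4352

Rounding up rather than down keeps the user's requested chunk size as a lower
bound, at the cost of a slightly larger attention buffer, since the patch sets
max_attention_size = chunk_size**2.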