From 6910b92c16eb6fed69c8ad8b4c3033f0441fc088 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Thu, 2 Jan 2025 18:08:18 +0530
Subject: [PATCH 01/14] bugfix #116

---
 src/chonkie/chunker/token.py | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 1912163..93cf09a 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -52,28 +52,27 @@ def __init__(
 
     def _create_chunks(
         self,
-        chunk_texts: List[str],
         token_counts: List[int],
-        decoded_text: str,
+        token_groups: List[List[int]]
     ) -> List[Chunk]:
         """Create chunks from a list of texts."""
         # package everything as Chunk objects and send out the result
+        chunk_texts = self._decode_batch(token_groups)
         chunks = []
         current_index = 0
-        for chunk_text, token_count in zip(chunk_texts, token_counts):
-            start_index = decoded_text.find(
-                chunk_text, current_index
-            )  # Find needs to be run every single time because of unknown overlap length
-            end_index = start_index + len(chunk_text)
+        for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups):
+            end_index = current_index + len(chunk_text)
             chunks.append(
                 Chunk(
                     text=chunk_text,
-                    start_index=start_index,
+                    start_index=current_index,
                     end_index=end_index,
                     token_count=token_count,
                 )
             )
-            current_index = end_index
+            # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
+            overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
+            current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]])))
         return chunks
 
     def chunk(self, text: str) -> List[Chunk]:
@@ -92,9 +91,6 @@ def chunk(self, text: str) -> List[Chunk]:
         # Encode full text
         text_tokens = self._encode(text)
 
-        # We decode the text because the tokenizer might result in a different output than text
-        decoded_text = self._decode(text_tokens)
-
         # Calculate chunk positions
         token_groups = [
             text_tokens[
                 start_index : min(start_index + self.chunk_size, len(text_tokens))
             ]
             for start_index in range(
                 0, len(text_tokens), self.chunk_size - self.chunk_overlap
             )
         ]
@@ -108,11 +104,7 @@ def chunk(self, text: str) -> List[Chunk]:
             len(toks) for toks in token_groups
         ]  # get the token counts; it's prolly chunk_size, but len doesn't take too long
 
-        chunk_texts = self._decode_batch(
-            token_groups
-        )  # decrease the time by decoding in one go (?)
-
-        chunks = self._create_chunks(chunk_texts, token_counts, decoded_text)
+        chunks = self._create_chunks(token_counts, token_groups)
 
         return chunks
 

From 83940b9076103bb470f8d393ccaa85352346c468 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Fri, 3 Jan 2025 12:26:52 +0530
Subject: [PATCH 02/14] update: bugfix #116

- removed the unnecessary `join` as there is only one token_group.
- replaced `_decode_batch` with `_decode`
---
 src/chonkie/chunker/token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 93cf09a..5e7f4b1 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -72,7 +72,7 @@ def _create_chunks(
             )
             # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
             overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]])))
+            current_index = end_index - len(self._decode(token_group[-overlap_tokens:]))
         return chunks
 
     def chunk(self, text: str) -> List[Chunk]:

From 53d532de3c3abd1d7cb681e409c4d11fe7e8d5d8 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Fri, 3 Jan 2025 21:49:29 +0530
Subject: [PATCH 03/14] update: bugfix #116

- `start_index` remains 0 when `chunk_overlap` is 0, fixed it.
---
 src/chonkie/chunker/token.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 5e7f4b1..8cf8e2b 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -70,9 +70,11 @@ def _create_chunks(
                     token_count=token_count,
                 )
             )
+
             # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
             overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len(self._decode(token_group[-overlap_tokens:]))
+            current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else []))
+
         return chunks

From e069fb7ad9c3d4c56d959f4cbd121d14036585b1 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Sat, 4 Jan 2025 10:33:56 +0530
Subject: [PATCH 04/14] update: bugfix #116

- applies only when chunk_overlap > 0
- batch decoding for overlap texts
---
 src/chonkie/chunker/token.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 8cf8e2b..94e1d73 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -60,7 +60,16 @@ def _create_chunks(
         chunk_texts = self._decode_batch(token_groups)
         chunks = []
         current_index = 0
-        for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups):
+
+        if (self.chunk_overlap > 0):
+            overlap_tokens_space = [
+                # we get the space taken by the overlapping text, that gives you the start_index for the next chunk
+                len(overlap_text)
+                for overlap_text in self._decode_batch([token_group[-(self.chunk_overlap - (self.chunk_size - len(token_group))):]
+                                                        for token_group in token_groups])
+            ]
+
+        for i, (chunk_text, token_count) in enumerate(zip(chunk_texts, token_counts)):
             end_index = current_index + len(chunk_text)
             chunks.append(
                 Chunk(
                     text=chunk_text,
                     start_index=current_index,
                     end_index=end_index,
                     token_count=token_count,
                 )
             )
-            # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
-            overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else []))
+
+            current_index = end_index - (overlap_tokens_space[i] if self.chunk_overlap > 0 else 0)
 
         return chunks

From 87b6306974bd46b2c79cdacafa9733ab4b8ed1c5 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 02:47:16 +0530
Subject: [PATCH 05/14] [fix] use proper decode batch functions in
 _decode_batch

---
 src/chonkie/chunker/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py
index 693eb41..a5ef8bc 100644
--- a/src/chonkie/chunker/base.py
+++ b/src/chonkie/chunker/base.py
@@ -183,11 +183,11 @@ def _decode(self, tokens) -> str:
     def _decode_batch(self, token_lists: List[List[int]]) -> List[str]:
         """Decode a batch of token lists using the backend tokenizer."""
         if self._tokenizer_backend == "transformers":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.batch_decode(token_lists, skip_special_tokens=True)
         elif self._tokenizer_backend == "tokenizers":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.decode_batch(token_lists)
         elif self._tokenizer_backend == "tiktoken":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.decode_batch(token_lists)
         elif self._tokenizer_backend == "callable":
             raise NotImplementedError(
                 "Callable tokenizer backend does not support batch decoding."

From 0d3069e9de7b21093cfeb26b10d6bad6a4fec651 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:19:51 +0530
Subject: [PATCH 06/14] [fix] start_index shouldn't use full_text find in
 batch mode

---
 src/chonkie/chunker/token.py | 94 ++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 46 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 4604de4..8297133 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -51,35 +51,38 @@ def __init__(
 
     def _create_chunks(
         self,
-        token_counts: List[int],
-        token_groups: List[List[int]]
+        chunk_texts: List[str],
+        token_groups: List[List[int]],
+        token_counts: List[int]
     ) -> List[Chunk]:
         """Create chunks from a list of texts."""
-        # package everything as Chunk objects and send out the result
-        chunk_texts = self._decode_batch(token_groups)
+        # Find the overlap lengths for index calculation
+        if self.chunk_overlap > 0:
+            # we get the overlap texts, that gives you the start_index for the next chunk
+            # if the token group is smaller than the overlap, we just use the whole token group
+            overlap_texts = self._decode_batch([token_group[-self.chunk_overlap:]
+                                                if (len(token_group) > self.chunk_overlap)
+                                                else token_group
+                                                for token_group in token_groups])
+            overlap_lengths = [len(overlap_text) for overlap_text in overlap_texts]
+        else:
+            overlap_lengths = [0] * len(token_groups)
+
+        # Create the chunks
         chunks = []
         current_index = 0
-
-        if (self.chunk_overlap > 0):
-            overlap_tokens_space = [
-                # we get the space taken by the overlapping text, that gives you the start_index for the next chunk
-                len(overlap_text)
-                for overlap_text in self._decode_batch([token_group[-(self.chunk_overlap - (self.chunk_size - len(token_group))):]
-                                                        for token_group in token_groups])
-            ]
-
-        for i, (chunk_text, token_count) in enumerate(zip(chunk_texts, token_counts)):
-            end_index = current_index + len(chunk_text)
+        for chunk_text, overlap_length, token_count in zip(chunk_texts, overlap_lengths, token_counts):
+            start_index = current_index
+            end_index = start_index + len(chunk_text)
             chunks.append(
                 Chunk(
                     text=chunk_text,
-                    start_index=current_index,
+                    start_index=start_index,
                     end_index=end_index,
                     token_count=token_count,
                 )
             )
-
-            current_index = end_index - (overlap_tokens_space[i] if self.chunk_overlap > 0 else 0)
+            current_index = end_index - overlap_length
 
         return chunks
@@ -100,32 +103,23 @@ def chunk(self, text: str) -> List[Chunk]:
         text_tokens = self._encode(text)
 
         # Calculate chunk positions
-        token_groups = [
-            text_tokens[
-                start_index : min(start_index + self.chunk_size, len(text_tokens))
-            ]
-            for start_index in range(
-                0, len(text_tokens), self.chunk_size - self.chunk_overlap
-            )
-        ]
-        token_counts = [
-            len(toks) for toks in token_groups
-        ]  # get the token counts; it's prolly chunk_size, but len doesn't take too long
+        token_groups = [text_tokens[start_index : min(start_index + self.chunk_size, len(text_tokens))]
+                        for start_index in range(0, len(text_tokens), self.chunk_size - self.chunk_overlap)]
+        token_counts = [len(toks) for toks in token_groups]
+
+        # decode the token groups into the chunk texts
+        chunk_texts = self._decode_batch(token_groups)
 
-        chunks = self._create_chunks(token_counts, token_groups)
+        # Create the chunks from the token groups and token counts
+        chunks = self._create_chunks(chunk_texts, token_groups, token_counts)
 
         return chunks
 
-    def _chunk_generator(
-        self, tokens: List[int]
-    ) -> Generator[Tuple[List[int], int, int], None, None]:
+    def _token_group_generator(self, tokens: List[int]) -> Generator[List[int]]:
         """Generate chunks from a list of tokens."""
-        stride = self.chunk_size - self.chunk_overlap
-        for start in range(0, len(tokens), stride):
+        for start in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
             end = min(start + self.chunk_size, len(tokens))
-            yield tokens[start:end], start, end
-            if end == len(tokens):
-                break
+            yield tokens[start:end]
 
     def _process_batch(self,
                        chunks: List[Tuple[List[int], int, int]],
@@ -149,22 +143,28 @@ def _process_batch(self,
 
     def _process_text_batch(self, texts: List[str]) -> List[List[Chunk]]:
         """Process a batch of texts."""
+        # encode the texts into tokens in a batch
         tokens_list = self._encode_batch(texts)
-        decoded_texts = self._decode_batch(tokens_list)
         result = []
 
-        for tokens, text in zip(tokens_list, decoded_texts):
+        for tokens in tokens_list:
             if not tokens:
                 result.append([])
                 continue
 
-            chunks = []
-            chunk_batch = []
+            # get the token groups
+            token_groups = []
+            for token_group in self._token_group_generator(tokens):
+                token_groups.append(token_group)
+
+            # get the token counts
+            token_counts = [len(token_group) for token_group in token_groups]
 
-            for chunk_data in self._chunk_generator(tokens):
-                chunk_batch.append(chunk_data)
+            # decode the token groups into the chunk texts
+            chunk_texts = self._decode_batch(token_groups)
 
-            chunks.extend(self._process_batch(chunk_batch, text))
+            # create the chunks from the token groups and token counts
+            chunks = self._create_chunks(chunk_texts, token_groups, token_counts)
 
             result.append(chunks)
 
         return result
@@ -182,6 +182,7 @@ def chunk_batch(
             List of lists of Chunk objects containing the chunked text and metadata
 
         """
+        # if batch_size is not None, we process the texts in mini-batches to avoid memory issues
         if batch_size is not None:
             chunks = []
             for i in range(0, len(texts), batch_size):
@@ -194,6 +195,7 @@
     def __repr__(self) -> str:
         """Return a string representation of the TokenChunker."""
         return (
-            f"TokenChunker(chunk_size={self.chunk_size}, "
+            f"TokenChunker(tokenizer={self.tokenizer}, "
+            f"chunk_size={self.chunk_size}, "
             f"chunk_overlap={self.chunk_overlap})"
         )

From a9d5eaa9e69268ef69a42889f26d80e879be6e55 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:32:59 +0530
Subject: [PATCH 07/14] use tiktoken for most tests

---
 tests/chunker/test_token_chunker.py | 34 ++++++++++++++---------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/chunker/test_token_chunker.py b/tests/chunker/test_token_chunker.py
index b990bcc..f0c9b5e 100644
--- a/tests/chunker/test_token_chunker.py
+++ b/tests/chunker/test_token_chunker.py
@@ -152,9 +152,9 @@ def test_token_chunker_initialization_tik(tiktokenizer):
     assert chunker.chunk_overlap == 128
 
 
-def test_token_chunker_chunking(tokenizer, sample_text):
+def test_token_chunker_chunking(tiktokenizer, sample_text):
     """Test that the TokenChunker can chunk a sample text into tokens."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk(sample_text)
 
     assert len(chunks) > 0
@@ -196,9 +196,9 @@ def test_token_chunker_chunking_tik(tiktokenizer, sample_text):
     assert all([chunk.end_index is not None for chunk in chunks])
 
 
-def test_token_chunker_empty_text(tokenizer):
+def test_token_chunker_empty_text(tiktokenizer):
     """Test that the TokenChunker can handle empty text input."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk("")
 
     assert len(chunks) == 0
@@ -246,9 +246,9 @@ def test_token_chunker_single_chunk_text(tokenizer):
     assert chunks[0].text == "Hello, how are you?"
 
 
-def test_token_chunker_batch_chunking(tokenizer, sample_batch):
+def test_token_chunker_batch_chunking(tiktokenizer, sample_batch):
     """Test that the TokenChunker can chunk a batch of texts into tokens."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk_batch(sample_batch)
 
     assert len(chunks) > 0
@@ -267,16 +267,16 @@ def test_token_chunker_batch_chunking(tokenizer, sample_batch):
     )
 
 
-def test_token_chunker_repr(tokenizer):
+def test_token_chunker_repr(tiktokenizer):
     """Test that the TokenChunker has a string representation."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
 
-    assert repr(chunker) == "TokenChunker(chunk_size=512, chunk_overlap=128)"
+    assert repr(chunker) == "TokenChunker(tokenizer=<Encoding 'gpt2'>, chunk_size=512, chunk_overlap=128)"
 
 
-def test_token_chunker_call(tokenizer, sample_text):
+def test_token_chunker_call(tiktokenizer, sample_text):
     """Test that the TokenChunker can be called directly."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker(sample_text)
 
     assert len(chunks) > 0
@@ -305,7 +305,7 @@ def verify_chunk_indices(chunks: List[Chunk], original_text: str):
     )
 
 
-def test_token_chunker_indices(sample_text):
+def test_token_chunker_indices(tiktokenizer, sample_text):
     """Test that TokenChunker's indices correctly map to original text."""
     tokenizer = Tokenizer.from_pretrained("gpt2")
     chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
@@ -321,19 +321,19 @@ def test_token_chunker_indices_complex_md(sample_complex_markdown_text):
     verify_chunk_indices(chunks, sample_complex_markdown_text)
 
 
-def test_token_chunker_token_counts(tokenizer, sample_text):
+def test_token_chunker_token_counts(tiktokenizer, sample_text):
     """Test that the TokenChunker correctly calculates token counts."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk(sample_text)
 
     assert all([chunk.token_count > 0 for chunk in chunks]), "All chunks must have a positive token count"
     assert all([chunk.token_count <= 512 for chunk in chunks]), "All chunks must have a token count less than or equal to 512"
 
-    token_counts = [len(tokenizer.encode(chunk.text)) for chunk in chunks]
+    token_counts = [len(tiktokenizer.encode(chunk.text)) for chunk in chunks]
     assert all([chunk.token_count == token_count for chunk, token_count in zip(chunks, token_counts)]), "All chunks must have a token count equal to the length of the encoded text"
 
 
-def test_token_chunker_indices_batch(tokenizer, sample_text):
+def test_token_chunker_indices_batch(tiktokenizer, sample_text):
     """Test that TokenChunker's indices correctly map to original text."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk_batch([sample_text]*10)[-1]
     verify_chunk_indices(chunks, sample_text)

From 09e26a3f7ece76c2c7c52d3b767558696c9d5897 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:33:35 +0530
Subject: [PATCH 08/14] [minor] fix the generator type syntax

---
 src/chonkie/chunker/token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 8297133..61bf1a3 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -115,7 +115,7 @@ def chunk(self, text: str) -> List[Chunk]:
 
         return chunks
 
-    def _token_group_generator(self, tokens: List[int]) -> Generator[List[int]]:
+    def _token_group_generator(self, tokens: List[int]) -> Generator[List[int], None, None]:
         """Generate chunks from a list of tokens."""
         for start in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
             end = min(start + self.chunk_size, len(tokens))

From bf20aa3c9b3b296e61816d302fe4c68b23c8e2fa Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:46:25 +0530
Subject: [PATCH 09/14] Update Python version requirement in pyproject.toml to
 support Python 3.8 and add corresponding classifier

---
 pyproject.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 76bf283..a774626 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "chonkie"
 version = "0.4.0"
 description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 license = {file = "LICENSE"}
 keywords = ["chunking", "rag", "nlp", "text-processing"]
 authors = [
@@ -17,6 +17,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8"
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -38,6 +39,7 @@ dependencies = [
 [project.urls]
 Homepage = "https://github.com/bhavnicksm/chonkie"
 Documentation = "https://docs.chonkie.ai"
+
 [project.optional-dependencies]
 model2vec = ["model2vec>=0.1.0", "numpy>=1.23.0, <2.2"]
 st = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2"]

From 62aa5a95dc8df20e839a5f09fd06c1f845aac33e Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:48:40 +0530
Subject: [PATCH 10/14] =?UTF-8?q?[fix]=20minor=20syntax=20fix=20=E2=80=94?=
 =?UTF-8?q?=20missing=20comma?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a774626..7d3ab31 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8"
+    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",

From d443199e05bfb82bd9f1e3d3c58910bfbf6ccd5c Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:54:32 +0530
Subject: [PATCH 11/14] =?UTF-8?q?Revert=20"[fix]=20minor=20syntax=20fix=20?=
 =?UTF-8?q?=E2=80=94=20missing=20comma"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 62aa5a95dc8df20e839a5f09fd06c1f845aac33e.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7d3ab31..a774626 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.8"
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",

From e3aa2d2de2d81f7e4cb173d7caaae0cebbaf4f16 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:56:14 +0530
Subject: [PATCH 12/14] =?UTF-8?q?Revert=20"[fix]=20minor=20syntax=20fix=20?=
 =?UTF-8?q?=E2=80=94=20missing=20comma"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 62aa5a95dc8df20e839a5f09fd06c1f845aac33e.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a774626..954a1cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,4 +65,4 @@ packages = ["chonkie", "chonkie.refinery"]
 
 [tool.ruff]
-select = ["F", "I", "D", "DOC"]
+select = ["F", "I", "D", "DOC"]
\ No newline at end of file

From 37009bcfbc93d75001056f0398e37977aba1a681 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:57:15 +0530
Subject: [PATCH 13/14] Bump up the version for "model2vec"

---
 pyproject.toml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 954a1cb..286c740 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "chonkie"
 version = "0.4.0"
 description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = {file = "LICENSE"}
 keywords = ["chunking", "rag", "nlp", "text-processing"]
 authors = [
@@ -17,7 +17,6 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8"
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -40,11 +39,11 @@ Homepage = "https://github.com/bhavnicksm/chonkie"
 Documentation = "https://docs.chonkie.ai"
 
 [project.optional-dependencies]
-model2vec = ["model2vec>=0.1.0", "numpy>=1.23.0, <2.2"]
+model2vec = ["model2vec>=0.3.0", "numpy>=1.23.0, <2.2"]
 st = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2"]
 openai = ["openai>=1.0.0", "numpy>=1.23.0, <2.2"]
-semantic = ["model2vec>=0.1.0", "numpy>=1.23.0, <2.2"]
-all = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2", "openai>=1.0.0", "model2vec>=0.1.0"]
+semantic = ["model2vec>=0.3.0", "numpy>=1.23.0, <2.2"]
+all = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2", "openai>=1.0.0", "model2vec>=0.3.0"]
 dev = [
     "pytest>=6.2.0",
     "pytest-cov>=4.0.0",

From 5e8a27f7b46a3ae3a5503ea573cd45055939ddcd Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 04:19:08 +0530
Subject: [PATCH 14/14] Remove tests for Py3.8

---
 .github/workflows/python-test-push.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-test-push.yml b/.github/workflows/python-test-push.yml
index 2717793..4b91312 100644
--- a/.github/workflows/python-test-push.yml
+++ b/.github/workflows/python-test-push.yml
@@ -8,7 +8,7 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
     steps:
       - uses: actions/checkout@v4
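
Note on the fix, with a standalone sketch (not part of the series): PATCH 06 settles on computing chunk indices arithmetically instead of calling decoded_text.find() for every chunk. It decodes the overlapping tail of each token group once, and the character length of that tail is exactly how far the next chunk's start_index steps back from the current chunk's end_index. The sketch below reproduces that bookkeeping with tiktoken; the gpt2 encoding, the sample text, and the toy sizes are assumptions for illustration, not values taken from the patches.

    import tiktoken

    enc = tiktoken.get_encoding("gpt2")
    chunk_size, chunk_overlap = 8, 2
    stride = chunk_size - chunk_overlap

    text = "Chonkie chunks text quickly and keeps the indices honest. " * 5
    tokens = enc.encode(text)

    # slice the token stream into overlapping groups, as chunk() does
    token_groups = [tokens[i : i + chunk_size] for i in range(0, len(tokens), stride)]
    chunk_texts = enc.decode_batch(token_groups)

    # decode only the overlap tail of each group; its character length tells us
    # how far the next start_index steps back from the current end_index
    tails = [g[-chunk_overlap:] if len(g) > chunk_overlap else g for g in token_groups]
    overlap_lengths = [len(t) for t in enc.decode_batch(tails)]

    current_index = 0
    for chunk_text, overlap_length in zip(chunk_texts, overlap_lengths):
        start_index, end_index = current_index, current_index + len(chunk_text)
        # the indices slice the original text exactly, with no find() calls
        assert text[start_index:end_index] == chunk_text
        current_index = end_index - overlap_length

The guards added in PATCH 03 and PATCH 04 matter because the earlier formula, self.chunk_overlap - (self.chunk_size - len(token_group)), goes negative for a short final group, and a negative slice bound such as token_group[-(-3):] silently selects the wrong tail. PATCH 06 sidesteps this by slicing with the fixed chunk_overlap and special-casing groups smaller than the overlap.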
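
An end-to-end check of the same behavior through the public API, under the assumption that chonkie (with this series applied) and tiktoken are installed; the sample text is made up for the example:

    import tiktoken
    from chonkie import TokenChunker

    tokenizer = tiktoken.get_encoding("gpt2")
    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)

    text = "Tokens in, well-indexed chunks out. " * 400
    for chunk in chunker.chunk(text):
        # after the fix, each chunk's indices slice the source text exactly,
        # mirroring verify_chunk_indices in the test suite; this can still
        # fail for inputs where decode(encode(text)) != text
        assert text[chunk.start_index : chunk.end_index] == chunk.text
        print(chunk.token_count, repr(chunk.text[:40]))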