From 6910b92c16eb6fed69c8ad8b4c3033f0441fc088 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Thu, 2 Jan 2025 18:08:18 +0530
Subject: [PATCH 01/14] bugfix #116

---
 src/chonkie/chunker/token.py | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 1912163..93cf09a 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -52,28 +52,27 @@ def __init__(
 
     def _create_chunks(
         self,
-        chunk_texts: List[str],
         token_counts: List[int],
-        decoded_text: str,
+        token_groups: List[List[int]]
     ) -> List[Chunk]:
         """Create chunks from a list of texts."""
         # package everything as Chunk objects and send out the result
+        chunk_texts = self._decode_batch(token_groups)
         chunks = []
         current_index = 0
-        for chunk_text, token_count in zip(chunk_texts, token_counts):
-            start_index = decoded_text.find(
-                chunk_text, current_index
-            )  # Find needs to be run every single time because of unknown overlap length
-            end_index = start_index + len(chunk_text)
+        for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups):
+            end_index = current_index + len(chunk_text)
             chunks.append(
                 Chunk(
                     text=chunk_text,
-                    start_index=start_index,
+                    start_index=current_index,
                     end_index=end_index,
                     token_count=token_count,
                 )
             )
-            current_index = end_index
+            # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
+            overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
+            current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]])))
         return chunks
 
     def chunk(self, text: str) -> List[Chunk]:
@@ -92,9 +91,6 @@ def chunk(self, text: str) -> List[Chunk]:
         # Encode full text
         text_tokens = self._encode(text)
 
-        # We decode the text because the tokenizer might result in a different output than text
-        decoded_text = self._decode(text_tokens)
-
         # Calculate chunk positions
         token_groups = [
             text_tokens[
                 start_index : min(start_index + self.chunk_size, len(text_tokens))
             ]
             for start_index in range(
                 0, len(text_tokens), self.chunk_size - self.chunk_overlap
             )
         ]
@@ -108,11 +104,7 @@ def chunk(self, text: str) -> List[Chunk]:
             len(toks) for toks in token_groups
         ]  # get the token counts; it's prolly chunk_size, but len doesn't take too long
 
-        chunk_texts = self._decode_batch(
-            token_groups
-        )  # decrease the time by decoding in one go (?)
-
-        chunks = self._create_chunks(chunk_texts, token_counts, decoded_text)
+        chunks = self._create_chunks(token_counts, token_groups)
 
         return chunks
 

From 83940b9076103bb470f8d393ccaa85352346c468 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Fri, 3 Jan 2025 12:26:52 +0530
Subject: [PATCH 02/14] update: bugfix #116

- removed the unnecessary `join` as there is only one token_group.
- replaced `_decode_batch` with `_decode`
---
 src/chonkie/chunker/token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 93cf09a..5e7f4b1 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -72,7 +72,7 @@ def _create_chunks(
             )
             # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
             overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len("".join(self._decode_batch([token_group[-overlap_tokens:]])))
+            current_index = end_index - len(self._decode(token_group[-overlap_tokens:]))
         return chunks
 
     def chunk(self, text: str) -> List[Chunk]:

From 53d532de3c3abd1d7cb681e409c4d11fe7e8d5d8 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Fri, 3 Jan 2025 21:49:29 +0530
Subject: [PATCH 03/14] update: bugfix #116

- `start_index` remains 0 when `chunk_overlap` is 0, fixed it.
---
 src/chonkie/chunker/token.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 5e7f4b1..8cf8e2b 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -70,9 +70,11 @@ def _create_chunks(
                     token_count=token_count,
                 )
             )
+
             # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
             overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len(self._decode(token_group[-overlap_tokens:]))
+            current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else []))
+
         return chunks

From e069fb7ad9c3d4c56d959f4cbd121d14036585b1 Mon Sep 17 00:00:00 2001
From: udayk02
Date: Sat, 4 Jan 2025 10:33:56 +0530
Subject: [PATCH 04/14] update: bugfix #116

- applies only when chunk_overlap > 0
- batch decoding for overlap texts
---
 src/chonkie/chunker/token.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 8cf8e2b..94e1d73 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -60,7 +60,16 @@ def _create_chunks(
         chunk_texts = self._decode_batch(token_groups)
         chunks = []
         current_index = 0
-        for chunk_text, token_count, token_group in zip(chunk_texts, token_counts, token_groups):
+
+        if (self.chunk_overlap > 0):
+            overlap_tokens_space = [
+                # we get the space taken by the overlapping text, that gives you the start_index for the next chunk
+                len(overlap_text)
+                for overlap_text in self._decode_batch([token_group[-(self.chunk_overlap - (self.chunk_size - len(token_group))):]
+                                                        for token_group in token_groups])
+            ]
+
+        for i, (chunk_text, token_count) in enumerate(zip(chunk_texts, token_counts)):
             end_index = current_index + len(chunk_text)
             chunks.append(
                 Chunk(
                     text=chunk_text,
                     start_index=current_index,
                     end_index=end_index,
                     token_count=token_count,
                 )
             )
-            # we subtract the space taken by the overlapping text, that gives you the start_index for the next chunk
-            overlap_tokens = self.chunk_overlap - (self.chunk_size - len(token_group))
-            current_index = end_index - len(self._decode(token_group[-overlap_tokens:] if overlap_tokens > 0 else []))
+
+            current_index = end_index - (overlap_tokens_space[i] if self.chunk_overlap > 0 else 0)
 
         return chunks

From 87b6306974bd46b2c79cdacafa9733ab4b8ed1c5 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 02:47:16 +0530
Subject: [PATCH 05/14] [fix] use proper decode batch functions in
 _decode_batch

---
 src/chonkie/chunker/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py
index 693eb41..a5ef8bc 100644
--- a/src/chonkie/chunker/base.py
+++ b/src/chonkie/chunker/base.py
@@ -183,11 +183,11 @@ def _decode(self, tokens) -> str:
     def _decode_batch(self, token_lists: List[List[int]]) -> List[str]:
         """Decode a batch of token lists using the backend tokenizer."""
         if self._tokenizer_backend == "transformers":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.batch_decode(token_lists, skip_special_tokens=True)
         elif self._tokenizer_backend == "tokenizers":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.decode_batch(token_lists)
         elif self._tokenizer_backend == "tiktoken":
-            return [self.tokenizer.decode(tokens) for tokens in token_lists]
+            return self.tokenizer.decode_batch(token_lists)
         elif self._tokenizer_backend == "callable":
             raise NotImplementedError(
                 "Callable tokenizer backend does not support batch decoding."

From 0d3069e9de7b21093cfeb26b10d6bad6a4fec651 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:19:51 +0530
Subject: [PATCH 06/14] [fix] start_index shouldn't use full_text find in
 batch mode

---
 src/chonkie/chunker/token.py | 94 ++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 46 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 4604de4..8297133 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -51,35 +51,38 @@ def __init__(
 
     def _create_chunks(
         self,
-        token_counts: List[int],
-        token_groups: List[List[int]]
+        chunk_texts: List[str],
+        token_groups: List[List[int]],
+        token_counts: List[int]
     ) -> List[Chunk]:
         """Create chunks from a list of texts."""
-        # package everything as Chunk objects and send out the result
-        chunk_texts = self._decode_batch(token_groups)
+        # Find the overlap lengths for index calculation
+        if self.chunk_overlap > 0:
+            # we get the overlap texts, that gives you the start_index for the next chunk
+            # if the token group is smaller than the overlap, we just use the whole token group
+            overlap_texts = self._decode_batch([token_group[-self.chunk_overlap:]
+                                                if (len(token_group) > self.chunk_overlap)
+                                                else token_group
+                                                for token_group in token_groups])
+            overlap_lengths = [len(overlap_text) for overlap_text in overlap_texts]
+        else:
+            overlap_lengths = [0] * len(token_groups)
+
+        # Create the chunks
         chunks = []
         current_index = 0
-
-        if (self.chunk_overlap > 0):
-            overlap_tokens_space = [
-                # we get the space taken by the overlapping text, that gives you the start_index for the next chunk
-                len(overlap_text)
-                for overlap_text in self._decode_batch([token_group[-(self.chunk_overlap - (self.chunk_size - len(token_group))):]
-                                                        for token_group in token_groups])
-            ]
-
-        for i, (chunk_text, token_count) in enumerate(zip(chunk_texts, token_counts)):
-            end_index = current_index + len(chunk_text)
+        for chunk_text, overlap_length, token_count in zip(chunk_texts, overlap_lengths, token_counts):
+            start_index = current_index
+            end_index = start_index + len(chunk_text)
             chunks.append(
                 Chunk(
                     text=chunk_text,
-                    start_index=current_index,
+                    start_index=start_index,
                     end_index=end_index,
                     token_count=token_count,
                 )
             )
-
-            current_index = end_index - (overlap_tokens_space[i] if self.chunk_overlap > 0 else 0)
+            current_index = end_index - overlap_length
 
         return chunks
@@ -100,32 +103,23 @@ def chunk(self, text: str) -> List[Chunk]:
         text_tokens = self._encode(text)
 
         # Calculate chunk positions
-        token_groups = [
-            text_tokens[
-                start_index : min(start_index + self.chunk_size, len(text_tokens))
-            ]
-            for start_index in range(
-                0, len(text_tokens), self.chunk_size - self.chunk_overlap
-            )
-        ]
-        token_counts = [
-            len(toks) for toks in token_groups
-        ]  # get the token counts; it's prolly chunk_size, but len doesn't take too long
+        token_groups = [text_tokens[start_index : min(start_index + self.chunk_size, len(text_tokens))]
+                        for start_index in range(0, len(text_tokens), self.chunk_size - self.chunk_overlap)]
+        token_counts = [len(toks) for toks in token_groups]
+
+        # decode the token groups into the chunk texts
+        chunk_texts = self._decode_batch(token_groups)
 
-        chunks = self._create_chunks(token_counts, token_groups)
+        # Create the chunks from the token groups and token counts
+        chunks = self._create_chunks(chunk_texts, token_groups, token_counts)
 
         return chunks
 
-    def _chunk_generator(
-        self, tokens: List[int]
-    ) -> Generator[Tuple[List[int], int, int], None, None]:
+    def _token_group_generator(self, tokens: List[int]) -> Generator[List[int]]:
         """Generate chunks from a list of tokens."""
-        stride = self.chunk_size - self.chunk_overlap
-        for start in range(0, len(tokens), stride):
+        for start in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
             end = min(start + self.chunk_size, len(tokens))
-            yield tokens[start:end], start, end
-            if end == len(tokens):
-                break
+            yield tokens[start:end]
 
     def _process_batch(self,
                        chunks: List[Tuple[List[int], int, int]],
@@ -149,22 +143,28 @@ def _process_batch(self,
 
     def _process_text_batch(self, texts: List[str]) -> List[List[Chunk]]:
         """Process a batch of texts."""
+        # encode the texts into tokens in a batch
         tokens_list = self._encode_batch(texts)
-        decoded_texts = self._decode_batch(tokens_list)
         result = []
 
-        for tokens, text in zip(tokens_list, decoded_texts):
+        for tokens in tokens_list:
             if not tokens:
                 result.append([])
                 continue
 
-            chunks = []
-            chunk_batch = []
+            # get the token groups
+            token_groups = []
+            for token_group in self._token_group_generator(tokens):
+                token_groups.append(token_group)
+
+            # get the token counts
+            token_counts = [len(token_group) for token_group in token_groups]
 
-            for chunk_data in self._chunk_generator(tokens):
-                chunk_batch.append(chunk_data)
+            # decode the token groups into the chunk texts
+            chunk_texts = self._decode_batch(token_groups)
 
-            chunks.extend(self._process_batch(chunk_batch, text))
+            # create the chunks from the token groups and token counts
+            chunks = self._create_chunks(chunk_texts, token_groups, token_counts)
 
             result.append(chunks)
 
         return result
@@ -182,6 +182,7 @@ def chunk_batch(
             List of lists of Chunk objects containing the chunked text and metadata
 
         """
+        # if batch_size is not None, we process the texts in mini-batches to avoid memory issues
         if batch_size is not None:
             chunks = []
             for i in range(0, len(texts), batch_size):
@@ -194,6 +195,7 @@
     def __repr__(self) -> str:
         """Return a string representation of the TokenChunker."""
         return (
-            f"TokenChunker(chunk_size={self.chunk_size}, "
+            f"TokenChunker(tokenizer={self.tokenizer}, "
+            f"chunk_size={self.chunk_size}, "
             f"chunk_overlap={self.chunk_overlap})"
         )

From a9d5eaa9e69268ef69a42889f26d80e879be6e55 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:32:59 +0530
Subject: [PATCH 07/14] use tiktoken for most tests

---
 tests/chunker/test_token_chunker.py | 34 ++++++++++++++---------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/chunker/test_token_chunker.py b/tests/chunker/test_token_chunker.py
index b990bcc..f0c9b5e 100644
--- a/tests/chunker/test_token_chunker.py
+++ b/tests/chunker/test_token_chunker.py
@@ -152,9 +152,9 @@ def test_token_chunker_initialization_tik(tiktokenizer):
     assert chunker.chunk_overlap == 128
 
 
-def test_token_chunker_chunking(tokenizer, sample_text):
+def test_token_chunker_chunking(tiktokenizer, sample_text):
     """Test that the TokenChunker can chunk a sample text into tokens."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk(sample_text)
 
     assert len(chunks) > 0
@@ -196,9 +196,9 @@ def test_token_chunker_chunking_tik(tiktokenizer, sample_text):
     assert all([chunk.end_index is not None for chunk in chunks])
 
 
-def test_token_chunker_empty_text(tokenizer):
+def test_token_chunker_empty_text(tiktokenizer):
     """Test that the TokenChunker can handle empty text input."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk("")
 
     assert len(chunks) == 0
@@ -246,9 +246,9 @@ def test_token_chunker_single_chunk_text(tokenizer):
     assert chunks[0].text == "Hello, how are you?"
 
 
-def test_token_chunker_batch_chunking(tokenizer, sample_batch):
+def test_token_chunker_batch_chunking(tiktokenizer, sample_batch):
     """Test that the TokenChunker can chunk a batch of texts into tokens."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk_batch(sample_batch)
 
     assert len(chunks) > 0
@@ -267,16 +267,16 @@ def test_token_chunker_batch_chunking(tokenizer, sample_batch):
     )
 
 
-def test_token_chunker_repr(tokenizer):
+def test_token_chunker_repr(tiktokenizer):
     """Test that the TokenChunker has a string representation."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
 
-    assert repr(chunker) == "TokenChunker(chunk_size=512, chunk_overlap=128)"
+    assert repr(chunker) == "TokenChunker(tokenizer=<Encoding 'gpt2'>, chunk_size=512, chunk_overlap=128)"
 
 
-def test_token_chunker_call(tokenizer, sample_text):
+def test_token_chunker_call(tiktokenizer, sample_text):
     """Test that the TokenChunker can be called directly."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker(sample_text)
 
     assert len(chunks) > 0
@@ -305,7 +305,7 @@ def verify_chunk_indices(chunks: List[Chunk], original_text: str):
     )
 
 
-def test_token_chunker_indices(sample_text):
+def test_token_chunker_indices(tiktokenizer, sample_text):
     """Test that TokenChunker's indices correctly map to original text."""
     tokenizer = Tokenizer.from_pretrained("gpt2")
     chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
@@ -321,19 +321,19 @@ def test_token_chunker_indices_complex_md(sample_complex_markdown_text):
     verify_chunk_indices(chunks, sample_complex_markdown_text)
 
 
-def test_token_chunker_token_counts(tokenizer, sample_text):
+def test_token_chunker_token_counts(tiktokenizer, sample_text):
     """Test that the TokenChunker correctly calculates token counts."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk(sample_text)
 
     assert all([chunk.token_count > 0 for chunk in chunks]), "All chunks must have a positive token count"
     assert all([chunk.token_count <= 512 for chunk in chunks]), "All chunks must have a token count less than or equal to 512"
 
-    token_counts = [len(tokenizer.encode(chunk.text)) for chunk in chunks]
+    token_counts = [len(tiktokenizer.encode(chunk.text)) for chunk in chunks]
     assert all([chunk.token_count == token_count for chunk, token_count in zip(chunks, token_counts)]), "All chunks must have a token count equal to the length of the encoded text"
 
 
-def test_token_chunker_indices_batch(tokenizer, sample_text):
+def test_token_chunker_indices_batch(tiktokenizer, sample_text):
     """Test that TokenChunker's indices correctly map to original text."""
-    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)
+    chunker = TokenChunker(tokenizer=tiktokenizer, chunk_size=512, chunk_overlap=128)
     chunks = chunker.chunk_batch([sample_text]*10)[-1]
     verify_chunk_indices(chunks, sample_text)

From 09e26a3f7ece76c2c7c52d3b767558696c9d5897 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:33:35 +0530
Subject: [PATCH 08/14] [minor] fix the generator type syntax

---
 src/chonkie/chunker/token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 8297133..61bf1a3 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -115,7 +115,7 @@ def chunk(self, text: str) -> List[Chunk]:
 
         return chunks
 
-    def _token_group_generator(self, tokens: List[int]) -> Generator[List[int]]:
+    def _token_group_generator(self, tokens: List[int]) -> Generator[List[int], None, None]:
         """Generate chunks from a list of tokens."""
         for start in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
             end = min(start + self.chunk_size, len(tokens))

From bf20aa3c9b3b296e61816d302fe4c68b23c8e2fa Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:46:25 +0530
Subject: [PATCH 09/14] Update Python version requirement in pyproject.toml to
 support Python 3.8 and add corresponding classifier

---
 pyproject.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 76bf283..a774626 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "chonkie"
 version = "0.4.0"
 description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.8"
 license = {file = "LICENSE"}
 keywords = ["chunking", "rag", "nlp", "text-processing"]
 authors = [
@@ -17,6 +17,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8"
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -38,6 +39,7 @@ dependencies = [
 [project.urls]
 Homepage = "https://github.com/bhavnicksm/chonkie"
 Documentation = "https://docs.chonkie.ai"
+
 [project.optional-dependencies]
 model2vec = ["model2vec>=0.1.0", "numpy>=1.23.0, <2.2"]
 st = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2"]

From 62aa5a95dc8df20e839a5f09fd06c1f845aac33e Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:48:40 +0530
Subject: [PATCH 10/14] =?UTF-8?q?[fix]=20minor=20syntax=20fix=20=E2=80=94?=
 =?UTF-8?q?=20missing=20comma?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a774626..7d3ab31 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8"
+    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",

From d443199e05bfb82bd9f1e3d3c58910bfbf6ccd5c Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:54:32 +0530
Subject: [PATCH 11/14] =?UTF-8?q?Revert=20"[fix]=20minor=20syntax=20fix=20?=
 =?UTF-8?q?=E2=80=94=20missing=20comma"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 62aa5a95dc8df20e839a5f09fd06c1f845aac33e.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7d3ab31..a774626 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.8"
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",

From e3aa2d2de2d81f7e4cb173d7caaae0cebbaf4f16 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:56:14 +0530
Subject: [PATCH 12/14] =?UTF-8?q?Revert=20"[fix]=20minor=20syntax=20fix=20?=
 =?UTF-8?q?=E2=80=94=20missing=20comma"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 62aa5a95dc8df20e839a5f09fd06c1f845aac33e.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a774626..954a1cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,4 +65,4 @@ packages = ["chonkie", "chonkie.refinery"]
 
 [tool.ruff]
-select = ["F", "I", "D", "DOC"]
+select = ["F", "I", "D", "DOC"]
\ No newline at end of file

From 37009bcfbc93d75001056f0398e37977aba1a681 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 03:57:15 +0530
Subject: [PATCH 13/14] Bump up the version for "model2vec"

---
 pyproject.toml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 954a1cb..286c740 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "chonkie"
 version = "0.4.0"
 description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = {file = "LICENSE"}
 keywords = ["chunking", "rag", "nlp", "text-processing"]
 authors = [
@@ -17,7 +17,6 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8"
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -40,11 +39,11 @@ Homepage = "https://github.com/bhavnicksm/chonkie"
 Documentation = "https://docs.chonkie.ai"
 
 [project.optional-dependencies]
-model2vec = ["model2vec>=0.1.0", "numpy>=1.23.0, <2.2"]
+model2vec = ["model2vec>=0.3.0", "numpy>=1.23.0, <2.2"]
 st = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2"]
 openai = ["openai>=1.0.0", "numpy>=1.23.0, <2.2"]
-semantic = ["model2vec>=0.1.0", "numpy>=1.23.0, <2.2"]
-all = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2", "openai>=1.0.0", "model2vec>=0.1.0"]
+semantic = ["model2vec>=0.3.0", "numpy>=1.23.0, <2.2"]
+all = ["sentence-transformers>=3.0.0", "numpy>=1.23.0, <2.2", "openai>=1.0.0", "model2vec>=0.3.0"]
 dev = [
     "pytest>=6.2.0",
     "pytest-cov>=4.0.0",

From 5e8a27f7b46a3ae3a5503ea573cd45055939ddcd Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sun, 5 Jan 2025 04:19:08 +0530
Subject: [PATCH 14/14] Remove tests for Py3.8

---
 .github/workflows/python-test-push.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-test-push.yml b/.github/workflows/python-test-push.yml
index 2717793..4b91312 100644
--- a/.github/workflows/python-test-push.yml
+++ b/.github/workflows/python-test-push.yml
@@ -8,7 +8,7 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
     steps:
       - uses: actions/checkout@v4
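
Note on the fix, with a standalone sketch (not part of the series): PATCH 06 settles on computing chunk indices arithmetically instead of calling decoded_text.find() for every chunk. It decodes the overlapping tail of each token group once, and the character length of that tail is exactly how far the next chunk's start_index steps back from the current chunk's end_index. The sketch below reproduces that bookkeeping with tiktoken; the gpt2 encoding, the sample text, and the toy sizes are assumptions for illustration, not values taken from the patches.

    import tiktoken

    enc = tiktoken.get_encoding("gpt2")
    chunk_size, chunk_overlap = 8, 2
    stride = chunk_size - chunk_overlap

    text = "Chonkie chunks text quickly and keeps the indices honest. " * 5
    tokens = enc.encode(text)

    # slice the token stream into overlapping groups, as chunk() does
    token_groups = [tokens[i : i + chunk_size] for i in range(0, len(tokens), stride)]
    chunk_texts = enc.decode_batch(token_groups)

    # decode only the overlap tail of each group; its character length tells us
    # how far the next start_index steps back from the current end_index
    tails = [g[-chunk_overlap:] if len(g) > chunk_overlap else g for g in token_groups]
    overlap_lengths = [len(t) for t in enc.decode_batch(tails)]

    current_index = 0
    for chunk_text, overlap_length in zip(chunk_texts, overlap_lengths):
        start_index, end_index = current_index, current_index + len(chunk_text)
        # the indices slice the original text exactly, with no find() calls
        assert text[start_index:end_index] == chunk_text
        current_index = end_index - overlap_length

The guards added in PATCH 03 and PATCH 04 matter because the earlier formula, self.chunk_overlap - (self.chunk_size - len(token_group)), goes negative for a short final group, and a negative slice bound such as token_group[-(-3):] silently selects the wrong tail. PATCH 06 sidesteps this by slicing with the fixed chunk_overlap and special-casing groups smaller than the overlap.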
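
An end-to-end check of the same behavior through the public API, under the assumption that chonkie (with this series applied) and tiktoken are installed; the sample text is made up for the example:

    import tiktoken
    from chonkie import TokenChunker

    tokenizer = tiktoken.get_encoding("gpt2")
    chunker = TokenChunker(tokenizer=tokenizer, chunk_size=512, chunk_overlap=128)

    text = "Tokens in, well-indexed chunks out. " * 400
    for chunk in chunker.chunk(text):
        # after the fix, each chunk's indices slice the source text exactly,
        # mirroring verify_chunk_indices in the test suite; this can still
        # fail for inputs where decode(encode(text)) != text
        assert text[chunk.start_index : chunk.end_index] == chunk.text
        print(chunk.token_count, repr(chunk.text[:40]))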