From 6647502e21714328038c63b41c48f3275dfbdf86 Mon Sep 17 00:00:00 2001 From: Shreyash Nigam Date: Sun, 5 Jan 2025 18:49:14 +0530 Subject: [PATCH 1/5] Update readme intro to match docs. --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b511731..57e3fb0 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,13 @@ _The no-nonsense RAG chunking library that's lightweight, lightning-fast, and re -so i found myself making another RAG bot (for the 2342148th time) and meanwhile, explaining to my juniors about why we should use chunking in our RAG bots, only to realise that i would have to write chunking all over again unless i use the bloated software library X or the extremely feature-less library Y. _WHY CAN I NOT HAVE SOMETHING JUST RIGHT, UGH?_ +Ever found yourself making a RAG bot yet again (your 2,342,148th one), +only to realize you’re stuck having to write chunking with bloated software library X or the painfully feature-less library Y? +_WHY CAN'T THIS JUST BE SIMPLE, UGH?_ -Can't i just install, import and run chunking and not have to worry about dependencies, bloat, speed or other factors? +What if all you had to do was install, import and run chunking? -Well, with chonkie you can! (chonkie boi is a gud boi) +Well, look no further than Chonkie! (chonkie boi is a gud boi) **🚀 Feature-rich**: All the CHONKs you'd ever need
**✨ Easy to use**: Install, Import, CHONK
From f7a7097a80f6701f1c24d17027e32ee5dd9a06fd Mon Sep 17 00:00:00 2001 From: bhavnicksm Date: Sun, 5 Jan 2025 22:59:27 +0530 Subject: [PATCH 2/5] [fix] High `chunk_overlap` causes last chunk to be entirely redundant --- src/chonkie/chunker/token.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py index 61bf1a3..ec6cc2b 100644 --- a/src/chonkie/chunker/token.py +++ b/src/chonkie/chunker/token.py @@ -85,6 +85,14 @@ def _create_chunks( current_index = end_index - overlap_length return chunks + + def _token_group_generator(self, tokens: List[int]) -> Generator[List[int], None, None]: + """Generate chunks from a list of tokens.""" + for start in range(0, len(tokens), self.chunk_size - self.chunk_overlap): + end = min(start + self.chunk_size, len(tokens)) + yield tokens[start:end] + if end == len(tokens): + break def chunk(self, text: str) -> List[Chunk]: """Split text into overlapping chunks of specified token size. 
@@ -102,9 +110,8 @@ def chunk(self, text: str) -> List[Chunk]: # Encode full text text_tokens = self._encode(text) - # Calculate chunk positions - token_groups = [text_tokens[start_index : min(start_index + self.chunk_size, len(text_tokens))] - for start_index in range(0, len(text_tokens), self.chunk_size - self.chunk_overlap)] + # Calculate token groups and counts + token_groups = list(self._token_group_generator(text_tokens)) token_counts = [len(toks) for toks in token_groups] # decode the token groups into the chunk texts @@ -115,12 +122,6 @@ def chunk(self, text: str) -> List[Chunk]: return chunks - def _token_group_generator(self, tokens: List[int]) -> Generator[List[int], None, None]: - """Generate chunks from a list of tokens.""" - for start in range(0, len(tokens), self.chunk_size - self.chunk_overlap): - end = min(start + self.chunk_size, len(tokens)) - yield tokens[start:end] - def _process_batch(self, chunks: List[Tuple[List[int], int, int]], full_text: str) -> List[Chunk]: @@ -153,9 +154,7 @@ def _process_text_batch(self, texts: List[str]) -> List[List[Chunk]]: continue # get the token groups - token_groups = [] - for token_group in self._token_group_generator(tokens): - token_groups.append(token_group) + token_groups = list(self._token_group_generator(tokens)) # get the token counts token_counts = [len(token_group) for token_group in token_groups] From c29b6ab5aa2adeb3f3d0bdc3b8e21ed87b458670 Mon Sep 17 00:00:00 2001 From: bhavnicksm Date: Mon, 6 Jan 2025 04:46:59 +0530 Subject: [PATCH 3/5] [FIX] Handle edge case for RecursiveChunker (#131) --- src/chonkie/chunker/recursive.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/chonkie/chunker/recursive.py b/src/chonkie/chunker/recursive.py index c390596..7bc6437 100644 --- a/src/chonkie/chunker/recursive.py +++ b/src/chonkie/chunker/recursive.py @@ -53,9 +53,13 @@ def _split_text(self, # Usually a good idea to check if there are any splits that are too short in characters # 
and then merge them merged_splits = [] - for split in splits: + for i, split in enumerate(splits): if len(split) < self.min_characters_per_chunk: - merged_splits[-1] += split + if merged_splits: + merged_splits[-1] += split + else: + splits[i+1] = split + splits[i+1] # merged_splits is still empty, so prepend this short split to the next split (NOTE: raises IndexError if this is the last split) + continue else: merged_splits.append(split) splits = merged_splits From 1927c7290653fea345189002fe9f1f36daf8e056 Mon Sep 17 00:00:00 2001 From: bhavnicksm Date: Mon, 6 Jan 2025 05:11:40 +0530 Subject: [PATCH 4/5] Enhance README.md introduction for clarity and engagement - Revised the introductory text to make it more relatable and engaging for users. - Added humorous comparisons between existing libraries and the simplicity of using Chonkie. - Emphasized the ease of use with a catchy phrase: "just CHONK it!" --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 57e3fb0..f49e0fb 100644 --- a/README.md +++ b/README.md @@ -23,13 +23,19 @@ _The no-nonsense RAG chunking library that's lightweight, lightning-fast, and re -Ever found yourself making a RAG bot yet again (your 2,342,148th one), -only to realize you’re stuck having to write chunking with bloated software library X or the painfully feature-less library Y? -_WHY CAN'T THIS JUST BE SIMPLE, UGH?_ +Ever found yourself building yet another RAG bot (your 2,342,148th, but who's counting?), only to hit that all-too-familiar wall? You know the one - where you're stuck choosing between: -What if all you had to do was install, import and run chunking? +- Library X: A behemoth that takes forever to install and probably includes three different kitchen sinks +- Library Y: So bare-bones it might as well be a "Hello World" program +- Writing it yourself? For the 2,342,149th time... sigh -Well, look no further than Chonkie! 
(chonkie boi is a gud boi) +And you think to yourself: + +> "WHY CAN'T THIS JUST BE SIMPLE?!"
+> "Why do I need to choose between bloated and bare-bones?"
+> "Why can't I just install, import, and CHONK?!"
+ +Well, look no further than Chonkie! (a chonkie boi is a gud boi 🦛💕) **🚀 Feature-rich**: All the CHONKs you'd ever need
**✨ Easy to use**: Install, Import, CHONK
@@ -39,7 +45,7 @@ Well, look no further than Chonkie! (chonkie boi is a gud boi) **🦛 Cute CHONK mascot**: psst it's a pygmy hippo btw
**❤️ [Moto Moto](#acknowledgements)'s favorite python library**
-What're you waiting for, **just CHONK it**! +**Chonkie** is a chunking library that "**just works™**". So what're you waiting for, **just CHONK it**! # Installation From 3e3bef7b7f88c498d0dec589b5f38cf66fb2fbfb Mon Sep 17 00:00:00 2001 From: bhavnicksm Date: Mon, 6 Jan 2025 05:13:04 +0530 Subject: [PATCH 5/5] [DOCS] Add one to intro line --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f49e0fb..58a486a 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ _The no-nonsense RAG chunking library that's lightweight, lightning-fast, and re -Ever found yourself building yet another RAG bot (your 2,342,148th, but who's counting?), only to hit that all-too-familiar wall? You know the one - where you're stuck choosing between: +Ever found yourself building yet another RAG bot (your 2,342,148th one, but who's counting?), only to hit that all-too-familiar wall? You know the one - where you're stuck choosing between: - Library X: A behemoth that takes forever to install and probably includes three different kitchen sinks - Library Y: So bare-bones it might as well be a "Hello World" program