-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: use external chunker [cog-1354] (#551)
<!-- .github/pull_request_template.md --> ## Description <!-- Provide a clear description of the changes in this PR --> ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Introduced a modular content chunking interface that offers flexible text segmentation with configurable chunk size and overlap. - Added new chunkers for enhanced text processing, including `LangchainChunker` and improved `TextChunker`. - **Refactor** - Unified the chunk extraction mechanism across various document types for improved consistency and type safety. - Updated method signatures to enhance clarity and type safety regarding chunker usage. - Enhanced error handling and logging during text segmentation to guide adjustments when content exceeds limits. - **Bug Fixes** - Adjusted expected output in tests to reflect changes in chunking logic and configurations. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
- Loading branch information
1 parent
eba1515
commit a61df96
Showing
11 changed files
with
91 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
class Chunker:
    """Base class for document chunkers.

    Stores the document, the text source and the size limits, and initialises
    the running counters. Subclasses are expected to override ``read``; the
    base implementation only raises ``NotImplementedError``.
    """

    def __init__(self, document, get_text: callable, max_chunk_tokens: int, chunk_size: int = 1024):
        """Record the chunking configuration and reset the running counters.

        Args:
            document: The document the produced chunks belong to.
            get_text: Callable stored on the instance as the text source;
                it is not invoked here.
            max_chunk_tokens: Token limit, stored as ``max_chunk_tokens``.
            chunk_size: Size limit, stored as ``max_chunk_size`` (NOT as
                ``chunk_size`` -- see the note below).
        """
        # Running state; all counters start at zero.
        self.chunk_index = 0
        # NOTE: ``self.chunk_size`` is a running counter, not the limit. The
        # ``chunk_size`` argument is kept in ``self.max_chunk_size``.
        self.chunk_size = 0
        self.token_count = 0

        # Static configuration.
        self.document = document
        self.max_chunk_size = chunk_size
        self.get_text = get_text
        self.max_chunk_tokens = max_chunk_tokens

    def read(self):
        """Produce the chunks of the document; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import logging | ||
from uuid import NAMESPACE_OID, uuid5 | ||
|
||
from cognee.modules.chunking.Chunker import Chunker | ||
from .models.DocumentChunk import DocumentChunk | ||
from langchain_text_splitters import RecursiveCharacterTextSplitter | ||
from cognee.infrastructure.databases.vector import get_vector_engine | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class LangchainChunker(Chunker):
    """
    A Chunker that splits text using Langchain's RecursiveCharacterTextSplitter.

    The splitter is configured with a word-based length function, so
    ``chunk_size`` and ``chunk_overlap`` are measured in whitespace-separated
    words, not in tokens: chunks are kept to approximately ``chunk_size`` words
    and adjacent chunks may share up to ``chunk_overlap`` words.

    Each resulting chunk is additionally checked against ``max_chunk_tokens``
    using the embedding engine's tokenizer; a chunk over that limit is an error.
    """

    def __init__(
        self,
        document,
        get_text: callable,
        max_chunk_tokens: int,
        chunk_size: int = 1024,
        chunk_overlap: int = 10,
    ):
        """Set up the base chunker state and the Langchain splitter.

        Args:
            document: The document the produced chunks belong to.
            get_text: Callable returning an iterable of text pieces to split.
            max_chunk_tokens: Maximum tokenizer token count allowed per chunk.
            chunk_size: Target chunk length in words, passed to the splitter.
            chunk_overlap: Overlap between adjacent chunks in words, passed to
                the splitter.
        """
        super().__init__(document, get_text, max_chunk_tokens, chunk_size)

        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Measure length in whitespace-separated words rather than characters.
            length_function=lambda text: len(text.split()),
        )

    def read(self):
        """Yield a ``DocumentChunk`` for every piece the splitter produces.

        ``self.chunk_index`` is incremented after each yielded chunk.

        Yields:
            DocumentChunk: One per split piece, in order.

        Raises:
            ValueError: If a piece has more tokens than ``max_chunk_tokens``.
        """
        # Resolved once, on the first chunk, instead of once per chunk. Kept
        # lazy so that input producing no chunks never touches the engine.
        tokenizer = None

        for content_text in self.get_text():
            for chunk in self.splitter.split_text(content_text):
                if tokenizer is None:
                    tokenizer = get_vector_engine().embedding_engine.tokenizer

                token_count = tokenizer.count_tokens(chunk)
                if token_count > self.max_chunk_tokens:
                    raise ValueError(
                        f"Chunk of {token_count} tokens is larger than the maximum of {self.max_chunk_tokens} tokens. Please reduce chunk_size in RecursiveCharacterTextSplitter."
                    )

                yield DocumentChunk(
                    # NOTE(review): the id is derived from the chunk text only,
                    # so identical text yields an identical id -- confirm that
                    # this de-duplication is intended.
                    id=uuid5(NAMESPACE_OID, chunk),
                    text=chunk,
                    word_count=len(chunk.split()),
                    token_count=token_count,
                    is_part_of=self.document,
                    chunk_index=self.chunk_index,
                    cut_type="missing",
                    contains=[],
                    metadata={
                        "index_fields": ["text"],
                    },
                )
                self.chunk_index += 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 3 additions & 2 deletions
5
cognee/modules/data/processing/document_types/TextDocument.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters