Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow users to set chunk size and overlap for indices #524

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libs/ktem/ktem/index/file/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class BaseFileIndexIndexing(BaseComponent):
FSPath = Param(help="The file storage path")
user_id = Param(help="The user id")
private = Param(False, help="Whether this is private index")
user_chunk_size = Param(help="Chunk size set by user")
user_chunk_overlap = Param(help="Chunk overlap set by user")

def run(
self, file_paths: str | Path | list[str | Path], *args, **kwargs
Expand Down
21 changes: 21 additions & 0 deletions libs/ktem/ktem/index/file/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,25 @@ def get_admin_settings(cls):
"choices": [("Yes", True), ("No", False)],
"info": "If private, files will not be accessible across users.",
},
"chunk_size": {
"name": "Size of chunk",
"value": -1,
"component": "number",
"info": (
"Number of characters of each text segment. "
"Set -1 to use developer setting."
),
},
"chunk_overlap": {
"name": "Max number of characters that can be overlap between segments",
"value": -1,
"component": "number",
"info": (
"Number of characters that consecutive text segments "
"should overlap with each other. "
"Set -1 to use developer setting."
),
},
}

def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
Expand All @@ -423,6 +442,8 @@ def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
obj.FSPath = self._fs_path
obj.user_id = user_id
obj.private = self.config.get("private", False)
obj.user_chunk_size = self.config.get("chunk_size", -1)
obj.user_chunk_overlap = self.config.get("chunk_overlap", -1)

return obj

Expand Down
14 changes: 13 additions & 1 deletion libs/ktem/ktem/index/file/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,17 @@ def route(self, file_path: str | Path) -> IndexPipeline:

Can subclass this method for a more elaborate pipeline routing strategy.
"""
_, chunk_size, chunk_overlap = dev_settings()

_, dev_chunk_size, dev_chunk_overlap = dev_settings()

chunk_size = (
self.user_chunk_size if self.user_chunk_size > 0 else dev_chunk_size
)
chunk_overlap = (
self.user_chunk_overlap
if self.user_chunk_overlap > 0
else dev_chunk_overlap
)

# check if file_path is a URL
if self.is_url(file_path):
Expand All @@ -744,6 +754,8 @@ def route(self, file_path: str | Path) -> IndexPipeline:
"the suitable pipeline for this file type in the settings."
)

print(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}")

print("Using reader", reader)
pipeline: IndexPipeline = IndexPipeline(
loader=reader,
Expand Down
Loading