-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #349 from bbrowning/chunking_testing
Upgrade docling, expand chunking testing
- Loading branch information
Showing 10 changed files with 379 additions and 9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,3 +13,4 @@ ignores: | |
- ".github/**" | ||
- "venv/**" | ||
- ".venv/**" | ||
- "**/testdata/**" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# Standard | ||
from pathlib import Path | ||
import os | ||
|
||
# First Party | ||
from instructlab.sdg.utils.chunkers import DocumentChunker | ||
|
||
# Path (as a string) to the "testdata" directory next to this test module.
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") | ||
|
||
|
||
def test_chunk_pdf(tmp_path):
    """Chunk the ``phoenix.pdf`` sample and sanity-check the resulting chunks.

    Asserts that more than 9 chunks are produced, that the first chunk
    contains an expected phrase, and that every chunk stays below a rough
    length ceiling. ``tmp_path`` is used as both the taxonomy path and the
    output directory handed to the chunker.
    """
    pdf_file = Path(os.path.join(TEST_DATA_DIR, "phoenix.pdf"))
    # NOTE(review): the "documents" entry is placeholder text; presumably the
    # chunker takes the PDF content from "filepaths" instead -- confirm.
    knowledge_node = {
        "documents": ["Lorem ipsum"],
        "filepaths": [pdf_file],
        "taxonomy_path": "knowledge",
    }
    document_chunker = DocumentChunker(
        leaf_node=[knowledge_node],
        taxonomy_path=tmp_path,
        output_dir=tmp_path,
        server_ctx_size=4096,
        chunk_word_count=500,
        tokenizer_model_name="instructlab/merlinite-7b-lab",
    )
    produced = document_chunker.chunk_documents()

    assert len(produced) > 9
    assert "Phoenix is a minor constellation" in produced[0]
    # Inexact sanity check of the maximum chunk length.
    assert all(len(piece) < 2500 for piece in produced)
|
||
|
||
def test_chunk_md(tmp_path):
    """Chunk the ``phoenix.md`` sample and sanity-check the resulting chunks.

    The markdown file is read as UTF-8 and passed to the chunker as the
    document text, alongside its path. Asserts that more than 7 chunks are
    produced and that every chunk stays below a rough length ceiling.
    ``tmp_path`` is used as both the taxonomy path and the output directory.
    """
    md_file = Path(os.path.join(TEST_DATA_DIR, "phoenix.md"))
    md_text = md_file.read_text(encoding="utf-8")
    knowledge_node = {
        "documents": [md_text],
        "filepaths": [md_file],
        "taxonomy_path": "knowledge",
    }
    document_chunker = DocumentChunker(
        leaf_node=[knowledge_node],
        taxonomy_path=tmp_path,
        output_dir=tmp_path,
        server_ctx_size=4096,
        chunk_word_count=500,
        tokenizer_model_name="instructlab/merlinite-7b-lab",
    )
    produced = document_chunker.chunk_documents()

    assert len(produced) > 7
    # Inexact sanity check of the maximum chunk length.
    assert all(len(piece) < 2500 for piece in produced)
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters