-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update test_chunkers to be more modular
Signed-off-by: Aakanksha Duggal <[email protected]>
- Loading branch information
1 parent
0b9a2fd
commit c84fa40
Showing
1 changed file
with
24 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,50 @@ | ||
# Standard | ||
from pathlib import Path | ||
import os | ||
from pathlib import Path | ||
|
||
# Third Party | ||
import pytest | ||
|
||
# First Party | ||
from instructlab.sdg.utils.chunkers import DocumentChunker | ||
|
||
# Constants for Test Directory and Test Documents | ||
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata") | ||
TEST_DOCUMENTS = { | ||
"pdf": "sample_documents/phoenix.pdf", | ||
"md": "sample_documents/phoenix.md" | ||
} | ||
|
||
@pytest.fixture(scope="module") | ||
def test_paths(): | ||
"""Fixture to return paths to test documents.""" | ||
return {doc_type: Path(os.path.join(TEST_DATA_DIR, path)) for doc_type, path in TEST_DOCUMENTS.items()} | ||
|
||
@pytest.fixture | ||
def tokenizer_model_name(): | ||
"""Fixture to return the path to the tokenizer model.""" | ||
return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") | ||
|
||
|
||
def test_chunk_pdf(tmp_path, tokenizer_model_name): | ||
pdf_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.pdf")) | ||
leaf_node = [ | ||
{ | ||
"documents": ["Lorem ipsum"], | ||
"filepaths": [pdf_path], | ||
"taxonomy_path": "knowledge", | ||
} | ||
] | ||
@pytest.mark.parametrize("doc_type, expected_chunks, contains_text", [ | ||
("pdf", 9, "Phoenix is a minor constellation"), | ||
("md", 7, None) # Assuming there's no specific text to check in Markdown | ||
]) | ||
def test_chunk_documents(tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text): | ||
""" | ||
Generalized test function for chunking documents. | ||
""" | ||
document_path = test_paths[doc_type] | ||
chunker = DocumentChunker( | ||
document_paths=[pdf_path], | ||
document_paths=[document_path], | ||
output_dir=tmp_path, | ||
tokenizer_model_name=tokenizer_model_name, | ||
server_ctx_size=4096, | ||
chunk_word_count=500, | ||
) | ||
chunks = chunker.chunk_documents() | ||
assert len(chunks) > 9 | ||
assert "Phoenix is a minor constellation" in chunks[0] | ||
assert len(chunks) > expected_chunks | ||
if contains_text: | ||
assert contains_text in chunks[0] | ||
for chunk in chunks: | ||
# inexact sanity-checking of chunk max length | ||
assert len(chunk) < 2500 | ||
|
||
|
||
def test_chunk_md(tmp_path, tokenizer_model_name): | ||
markdown_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.md")) | ||
leaf_node = [ | ||
{ | ||
"documents": [markdown_path.read_text(encoding="utf-8")], | ||
"filepaths": [markdown_path], | ||
"taxonomy_path": "knowledge", | ||
} | ||
] | ||
chunker = DocumentChunker( | ||
document_paths=[markdown_path], | ||
output_dir=tmp_path, | ||
tokenizer_model_name=tokenizer_model_name, | ||
server_ctx_size=4096, | ||
chunk_word_count=500, | ||
) | ||
chunks = chunker.chunk_documents() | ||
assert len(chunks) > 7 | ||
for chunk in chunks: | ||
# inexact sanity-checking of chunk max length | ||
assert len(chunk) < 2500 |