Skip to content

Commit

Permalink
Merge pull request #349 from bbrowning/chunking_testing
Browse files Browse the repository at this point in the history
Upgrade docling, expand chunking testing
  • Loading branch information
mergify[bot] authored Nov 8, 2024
2 parents fc709af + e558258 commit e0698d6
Show file tree
Hide file tree
Showing 10 changed files with 379 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ jobs:
python -m pip install --upgrade pip
# Quote the requirement specifier: unquoted, the shell treats ">=1.2" as a
# redirect of stdout to a file named "=1.2" and installs an unpinned tox-gh.
python -m pip install tox "tox-gh>=1.2"
- name: Run unit tests with tox
- name: Run unit and functional tests with tox
run: |
tox
Expand Down
1 change: 1 addition & 0 deletions .markdownlint-cli2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ ignores:
- ".github/**"
- "venv/**"
- ".venv/**"
- "**/testdata/**"
2 changes: 1 addition & 1 deletion .spellcheck.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ matrix:
camel-case: true
mode: markdown
sources:
- "**/*.md|!.tox/**|!venv/**"
- "**/*.md|!.tox/**|!venv/**|!**/testdata/**"
dictionary:
wordlists:
- .spellcheck-en-custom.txt
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=2.3.0,<3.0.0
docling>=2.4.2,<3.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
Expand Down
2 changes: 2 additions & 0 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ def __new__(
doc_dict = cls._split_docs_by_filetype(documents, filepaths)
if len(doc_dict.keys()) > 1:
raise ValueError("Received multiple document types")
if len(doc_dict.keys()) < 1:
raise ValueError("Received no document types")

if FileTypes.MD in doc_dict:
doc_contents = [d for d, _ in doc_dict[FileTypes.MD]]
Expand Down
56 changes: 56 additions & 0 deletions tests/functional/test_chunkers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Standard
from pathlib import Path
import os

# First Party
from instructlab.sdg.utils.chunkers import DocumentChunker

TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")


def test_chunk_pdf(tmp_path):
    """Chunk the sample PDF and sanity-check the count and size of the chunks."""
    pdf_path = Path(os.path.join(TEST_DATA_DIR, "phoenix.pdf"))
    node_entry = {
        "documents": ["Lorem ipsum"],
        "filepaths": [pdf_path],
        "taxonomy_path": "knowledge",
    }

    chunker = DocumentChunker(
        leaf_node=[node_entry],
        taxonomy_path=tmp_path,
        output_dir=tmp_path,
        server_ctx_size=4096,
        chunk_word_count=500,
        tokenizer_model_name="instructlab/merlinite-7b-lab",
    )
    produced = chunker.chunk_documents()

    assert len(produced) > 9
    assert "Phoenix is a minor constellation" in produced[0]
    # inexact sanity-checking of chunk max length
    for piece in produced:
        assert len(piece) < 2500


def test_chunk_md(tmp_path):
    """Chunk the sample Markdown file and sanity-check the count and size of the chunks."""
    markdown_path = Path(os.path.join(TEST_DATA_DIR, "phoenix.md"))
    markdown_text = markdown_path.read_text(encoding="utf-8")
    node_entry = {
        "documents": [markdown_text],
        "filepaths": [markdown_path],
        "taxonomy_path": "knowledge",
    }

    chunker = DocumentChunker(
        leaf_node=[node_entry],
        taxonomy_path=tmp_path,
        output_dir=tmp_path,
        server_ctx_size=4096,
        chunk_word_count=500,
        tokenizer_model_name="instructlab/merlinite-7b-lab",
    )
    produced = chunker.chunk_documents()

    assert len(produced) > 7
    # inexact sanity-checking of chunk max length
    for piece in produced:
        assert len(piece) < 2500
284 changes: 284 additions & 0 deletions tests/functional/testdata/phoenix.md

Large diffs are not rendered by default.

Binary file added tests/functional/testdata/phoenix.pdf
Binary file not shown.
19 changes: 19 additions & 0 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,22 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
output_dir=temp_dir,
tokenizer_model_name="instructlab/merlinite-7b-lab",
)


def test_chunker_factory_empty_filetype(documents_dir):
    """Test that the DocumentChunker factory class fails when provided no documents"""
    leaf_node = [
        {
            "documents": [],
            "taxonomy_path": "",
            "filepaths": [],
        }
    ]
    with tempfile.TemporaryDirectory() as temp_dir:
        # Match on the message so this test only passes for the empty-input
        # guard, not for any other ValueError the factory may raise (e.g. the
        # "multiple document types" check).
        with pytest.raises(ValueError, match="Received no document types"):
            _ = DocumentChunker(
                leaf_node=leaf_node,
                taxonomy_path=documents_dir,
                output_dir=temp_dir,
                tokenizer_model_name="instructlab/merlinite-7b-lab",
            )
20 changes: 14 additions & 6 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
[tox]
# py3-unit runs unit tests with 'python3'
# py311-unit runs the same tests with 'python3.11'
envlist = ruff, lint, mypy, spellcheck, py3-unit
envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional}
minversion = 4.4

[testenv]
description = run tests (unit, unitcov)
description = run tests (unit, unitcov, functional)
# Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
# are huge. This reduces venv from 5.7 GB to 1.5 GB.
setenv =
Expand All @@ -16,8 +16,16 @@ package = wheel
wheel_build_env = pkg
deps = -r requirements-dev.txt
commands =
unit: {envpython} -m pytest {posargs:tests}
unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"}
unit: {envpython} -m pytest {posargs:tests --ignore=tests/functional}
unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or slow)"}
functional: {envpython} -m pytest {posargs:tests/functional}
allowlist_externals =
functional: ./scripts/functional-tests.sh

[testenv:py3-functional]
setenv =
OPENAI_API_BASE={env:OPENAI_API_BASE:http://localhost:8000/v1}
OPENAI_API_KEY={env:OPENAI_API_KEY:EMPTY}

# format, check, and linting targets don't build and install the project to
# speed up testing.
Expand Down Expand Up @@ -82,5 +90,5 @@ commands =

[gh]
python =
3.11 = py311-unitcov
3.10 = py310-unitcov
3.11 = py311-{unitcov, functional}
3.10 = py310-{unitcov, functional}

0 comments on commit e0698d6

Please sign in to comment.