Add tests for the datamixing ensuring all required datasets are mixed appropriately #375

Merged · 2 commits · Jan 28, 2025
98 changes: 96 additions & 2 deletions tests/test_datamixing.py
@@ -10,17 +10,36 @@
import os

# Third Party
-from datasets import Dataset
+from datasets import Dataset, concatenate_datasets, load_dataset

# First Party
-from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples
+from instructlab.sdg.datamixing import (
+    DataMixer,
+    Recipe,
+    _add_extra_contexts_to_samples,
+    _create_phase07_ds,
+    _create_phase10_ds,
+)

# We mock out the actual things that use num_procs anyway, but just
# for a consistent value in the tests...
TEST_NUM_PROCS = 4
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
TEST_RECIPE_PATH = os.path.join(TEST_DATA_DIR, "relative_path_recipe.yaml")
TEST_SAMPLES_ABS_PATH = os.path.join(TEST_DATA_DIR, "datasets/samples.jsonl")
TEST_KNOWLEDGE_PATH = os.path.join(TEST_DATA_DIR, "datasets/knowledge.jsonl")
TEST_KNOWLEDGE_SKILLS_PATH = os.path.join(
    TEST_DATA_DIR, "datasets/knowledge_skills.jsonl"
)
TEST_AUXILIARY_PATH = os.path.join(TEST_DATA_DIR, "datasets/auxiliary.jsonl")


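# Auxiliary instruction config passed to the phase-creation helpers below;
# each key names an auxiliary dataset and maps to its candidate instructions.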
auxiliary_inst = {
    "spellcheck": [
        "Correct any spelling errors in the document and output the corrected version.",
        "Rewrite the document to remove any spelling errors.",
    ]
}


def _empty_recipe(self):
@@ -31,6 +50,18 @@ def _noop_sample(dataset, _sampling_size, _num_procs):
    return dataset


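# Loaders for the small JSONL fixtures added under tests/testdata/datasets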
def load_knowledge_dataset():
    return load_dataset("json", data_files=TEST_KNOWLEDGE_PATH, split="train")


def load_knowledge_skills_ds():
    return load_dataset("json", data_files=TEST_KNOWLEDGE_SKILLS_PATH, split="train")


def load_auxiliary_dataset():
    return load_dataset("json", data_files=TEST_AUXILIARY_PATH, split="train")


def _fake_context(msg_id):
    return {
        "context": f"context {msg_id}",
@@ -214,3 +245,66 @@ def test_add_extra_contexts_to_samples_with_six_samples_distractor_path():
assert f"context context{i+1}" not in sample_content
# ensure we have the expected number of contexts
assert sample_content.count("Document:\ncontext") == num_doc_in_context


@patch("instructlab.sdg.datamixing._create_auxiliary_dataset")
def test_phase07_creation(mock_auxiliary_dataset):
"""
Test Phase 0.7 dataset creation.

Phase 0.7 should include knowledge and auxiliary datasets.
"""
knowledge_dataset = load_knowledge_dataset()
auxiliary_dataset = load_auxiliary_dataset()
mock_auxiliary_dataset.return_value = auxiliary_dataset
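    # With _create_auxiliary_dataset patched, phase creation mixes in this
    # fixture instead of generating auxiliary samples from auxiliary_inst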

    # Create Phase 0.7 dataset
    phase07_ds = _create_phase07_ds(
        generated_dataset=knowledge_dataset,
        auxiliary_inst=auxiliary_inst,
        use_legacy_pretraining_format=False,
    )

    # Check if Phase 0.7 contains knowledge and auxiliary datasets
    expected_phase07_size = len(knowledge_dataset) + len(auxiliary_dataset)
    assert (
        len(phase07_ds) == expected_phase07_size
    ), "Phase 0.7 should contain knowledge and auxiliary datasets."

    # Verify that the content from all datasets is present in Phase 0.7
    auxiliary_ids = {item["id"] for item in auxiliary_dataset}
    phase07_ids = {item["id"] for item in phase07_ds}

    assert auxiliary_ids.issubset(
        phase07_ids
    ), "Phase 0.7 should include all auxiliary dataset entries."


@patch("instructlab.sdg.datamixing._create_auxiliary_dataset")
def test_phase10_creation(mock_auxiliary_dataset):
"""
Test Phase 1.0 dataset creation.

Phase 1.0 should include the content of Phase 0.7, along with auxiliary and knowledge_skills datasets.
"""
knowledge_dataset = load_knowledge_dataset()
auxiliary_dataset = load_auxiliary_dataset()
knowledge_skills_ds = load_knowledge_skills_ds()
mock_auxiliary_dataset.return_value = auxiliary_dataset

    # Create Phase 1.0 dataset
    phase10_ds = _create_phase10_ds(
        generated_dataset=knowledge_skills_ds,
        auxiliary_inst=auxiliary_inst,
        use_legacy_pretraining_format=False,
    )

    # Expected size calculation for Phase 1.0
    phase10_expected_size = (
        len(knowledge_dataset) + len(knowledge_skills_ds) + len(auxiliary_dataset)
    )

    # Check if Phase 1.0 includes knowledge, auxiliary, and knowledge_skills content
    assert (
        len(phase10_ds) == phase10_expected_size
    ), "Phase 1.0 should contain the expected number of entries, including Phase 0.7 content."
2 changes: 2 additions & 0 deletions tests/testdata/datasets/auxiliary.jsonl
@@ -0,0 +1,2 @@
{"id": "aux_001", "messages": [{"content": "user message", "role": "user"}, {"content": "assistant message", "role": "assistant"}], "metadata": "{}"}
{"id": "aux_002", "messages": [{"content": "user message", "role": "user"}, {"content": "assistant message", "role": "assistant"}], "metadata": "{}"}
1 change: 1 addition & 0 deletions tests/testdata/datasets/knowledge.jsonl
@@ -0,0 +1 @@
{"id": "knowledge_001", "document": "the document", "question": "This is a question?", "response": "This is the generated response", "domain": "a domain", "metadata": "{}"}
1 change: 1 addition & 0 deletions tests/testdata/datasets/knowledge_skills.jsonl
@@ -0,0 +1 @@
{"id": "knowledge_skills_001", "document": "the document", "question": "This is a question?", "response": "This is the generated response", "domain": "a domain", "metadata": "{}"}