Add examples for how to use data mixing
This adds a new docs/examples/mix_datasets folder with a couple of
example recipes, two sample datasets, and an example_mixing.py Python
script that shows how to mix datasets.

This also adds a test_examples.py file that actually runs our
examples, ensuring they work without error and generate the expected
mixed datasets.

Signed-off-by: Ben Browning <[email protected]>
bbrowning committed Jan 16, 2025
1 parent 6c8544e commit 1f9394d
Showing 11 changed files with 203 additions and 77 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -167,3 +167,6 @@ cython_debug/
 
 # IDEs
 .vscode/
+
+# SDG examples output
+docs/examples/**/output
8 changes: 8 additions & 0 deletions docs/examples/mix_datasets/concatenate_recipe.yaml
@@ -0,0 +1,8 @@
# An example of how to concatenate two datasets
# Each dataset has a sampling_size of 1.0 to take all samples from both
datasets:
  - path: dataset_1.jsonl
    sampling_size: 1.0
  - path: dataset_2.jsonl
    sampling_size: 1.0
sys_prompt: I am a reliable AI assistant.
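Since each source file below contains five samples and both use a sampling_size of 1.0, mixing with this recipe produces a single 10-sample dataset containing every sample from both files.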
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_1.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_1_1", "messages": [], "metadata": {}}
{"id": "dataset_1_2", "messages": [], "metadata": {}}
{"id": "dataset_1_3", "messages": [], "metadata": {}}
{"id": "dataset_1_4", "messages": [], "metadata": {}}
{"id": "dataset_1_5", "messages": [], "metadata": {}}
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_2.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_2_1", "messages": [], "metadata": {}}
{"id": "dataset_2_2", "messages": [], "metadata": {}}
{"id": "dataset_2_3", "messages": [], "metadata": {}}
{"id": "dataset_2_4", "messages": [], "metadata": {}}
{"id": "dataset_2_5", "messages": [], "metadata": {}}
18 changes: 18 additions & 0 deletions docs/examples/mix_datasets/example_mixing.py
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from pathlib import Path

# First Party
from instructlab.sdg import mix_datasets

output_dir = Path(__file__).parent.joinpath("output")
output_dir.mkdir(exist_ok=True)

concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)

weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
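The mixed output files are plain JSON Lines, so they can be inspected with the same jlload helper the functional test below uses. A minimal sketch (the relative path is an assumption that it is run from the repository root, after example_mixing.py has run):

# Load and count the weighted mix produced by example_mixing.py
from instructlab.sdg.utils.json import jlload

weighted = jlload("docs/examples/mix_datasets/output/weighted.jsonl")
print(len(weighted))  # 11 with the sample datasets in this directory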
9 changes: 9 additions & 0 deletions docs/examples/mix_datasets/weighted_recipe.yaml
@@ -0,0 +1,9 @@
# An example of how to weight one dataset over another
# Dataset 1 has a sampling size of 2.0 to double its samples
# Dataset 2 has a sampling size of 0.2 to take 20% of its samples
datasets:
  - path: dataset_1.jsonl
    sampling_size: 2.0
  - path: dataset_2.jsonl
    sampling_size: 0.2
sys_prompt: I am a reliable AI assistant.
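Applied to the five-sample files above, this recipe duplicates dataset_1 (5 × 2.0 = 10 samples) and downsamples dataset_2 (5 × 0.2 = 1 sample), so the mixed output holds 11 samples in total, which is what the functional test below asserts.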
7 changes: 7 additions & 0 deletions tests/functional/conftest.py
@@ -6,9 +6,16 @@
 import pytest
 
 TESTS_PATH = pathlib.Path(__file__).parent.parent.absolute()
+EXAMPLES_PATH = TESTS_PATH.parent.joinpath("docs", "examples")
 
 
 @pytest.fixture
 def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
     """Path to local test data directory"""
     yield TESTS_PATH / "testdata"
+
+
+@pytest.fixture
+def examples_path() -> typing.Generator[pathlib.Path, None, None]:
+    """Path to examples directory"""
+    yield EXAMPLES_PATH
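The new examples_path fixture resolves the in-tree docs/examples directory relative to the tests themselves, so functional tests can locate the example files regardless of the directory pytest is invoked from.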
40 changes: 40 additions & 0 deletions tests/functional/test_examples.py
@@ -0,0 +1,40 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import pathlib
import shutil
import subprocess
import sys

# First Party
from instructlab.sdg.utils.json import jlload


def test_example_mixing(tmp_path: pathlib.Path, examples_path: pathlib.Path):
    example_copy_path = tmp_path.joinpath("mix_datasets")
    shutil.copytree(examples_path.joinpath("mix_datasets"), example_copy_path)
    script = example_copy_path.joinpath("example_mixing.py")
    subprocess.check_call([sys.executable, str(script)], text=True)

    concatenated = jlload(example_copy_path.joinpath("output", "concatenated.jsonl"))
    assert len(concatenated) == 10
    from_ds_1 = []
    from_ds_2 = []
    for sample in concatenated:
        if sample["id"].startswith("dataset_1"):
            from_ds_1.append(sample)
        else:
            from_ds_2.append(sample)
    assert len(from_ds_1) == len(from_ds_2) == 5

    weighted = jlload(example_copy_path.joinpath("output", "weighted.jsonl"))
    assert len(weighted) == 11
    from_ds_1 = []
    from_ds_2 = []
    for sample in weighted:
        if sample["id"].startswith("dataset_1"):
            from_ds_1.append(sample)
        else:
            from_ds_2.append(sample)
    assert len(from_ds_1) == 10
    assert len(from_ds_2) == 1
170 changes: 99 additions & 71 deletions tests/functional/test_granular_api.py
@@ -4,10 +4,13 @@
 from datetime import datetime
 from unittest.mock import MagicMock
 import glob
+import os
 import pathlib
+import unittest
 
 # Third Party
 import git
+import pytest
 
 # First Party
 from instructlab.sdg import BlockRegistry
@@ -20,6 +23,7 @@
 
 # Local
 from ..mockllmblock import MockLLMBlock
+from ..taxonomy import load_test_skills
 
 
 def _clone_instructlab_taxonomy(taxonomy_dir):
@@ -29,75 +33,99 @@ def _clone_instructlab_taxonomy(taxonomy_dir):
     repo.git.checkout(taxonomy_commit)
 
 
-def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.Path):
-    # Register our mock block so we can reference it in pipelines
-    BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
-
-    # Clone a taxonomy and edit 1 file in it
-    taxonomy_dir = tmp_path.joinpath("taxonomy")
-    _clone_instructlab_taxonomy(taxonomy_dir)
-    changed_qna_yaml = taxonomy_dir.joinpath(
-        "knowledge", "science", "animals", "birds", "black_capped_chickadee", "qna.yaml"
-    )
-    with open(changed_qna_yaml, "a", encoding="utf-8") as file:
-        file.write("")
-
-    pipeline_dir = testdata_path.joinpath("mock_pipelines")
-    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
-
-    preprocessed_dir = tmp_path.joinpath("preprocessed")
-    teacher_model_path = testdata_path.joinpath("models/instructlab/granite-7b-lab")
-    preprocess_taxonomy(
-        taxonomy_dir=taxonomy_dir,
-        output_dir=preprocessed_dir,
-        teacher_model_path=teacher_model_path,
-    )
-    chickadee_docs = glob.glob(
-        str(
-            preprocessed_dir.joinpath(
-                "documents", "knowledge_science_*", "chickadee.md"
-            )
-        )
-    )
-    assert chickadee_docs
-    chickadee_samples_path = preprocessed_dir.joinpath(
-        "knowledge_science_animals_birds_black_capped_chickadee.jsonl"
-    )
-    assert chickadee_samples_path.is_file()
-
-    client = MagicMock()
-    client.server_supports_batched = False
-    generated_dir = tmp_path.joinpath("generated")
-    generate_taxonomy(
-        client=client,
-        input_dir=preprocessed_dir,
-        output_dir=generated_dir,
-        pipeline=pipeline_dir,
-    )
-    generated_chickadee_samples_path = generated_dir.joinpath(
-        "knowledge_science_animals_birds_black_capped_chickadee.jsonl"
-    )
-    assert generated_chickadee_samples_path.is_file()
-
-    postprocessed_dir = tmp_path.joinpath("postprocessed")
-    postprocess_taxonomy(
-        input_dir=generated_dir,
-        output_dir=postprocessed_dir,
-        date_suffix=date_suffix,
-        pipeline=pipeline_dir,
-    )
-    knowledge_recipe_file = postprocessed_dir.joinpath(
-        f"knowledge_recipe_{date_suffix}.yaml"
-    )
-    assert knowledge_recipe_file.is_file()
-    skills_recipe_file = postprocessed_dir.joinpath(f"skills_recipe_{date_suffix}.yaml")
-    assert skills_recipe_file.is_file()
-
-    mixed_skills_output_file = (
-        f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl"
-    )
-    mix_datasets(
-        recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml",
-        output_file=mixed_skills_output_file,
-    )
-    assert pathlib.Path(mixed_skills_output_file).is_file()
+class TestGranularAPI(unittest.TestCase):
+    @pytest.fixture(autouse=True)
+    def _init_taxonomy(self, taxonomy_dir, testdata_path, tmp_path):
+        self.test_taxonomy = taxonomy_dir
+        self.testdata_path = testdata_path
+        self.tmp_path = tmp_path
+
+    def setUp(self):
+        test_valid_knowledge_skill_file = self.testdata_path.joinpath(
+            "test_valid_knowledge_skill.yaml"
+        )
+        untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
+        test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
+        self.test_taxonomy.create_untracked(
+            untracked_knowledge_file, test_valid_knowledge_skill
+        )
+
+    def file_list(self):
+        return glob.glob(str(self.tmp_path.joinpath("**/*")), recursive=True)
+
+    def test_granular_api_end_to_end(self):
+        # Register our mock block so we can reference it in pipelines
+        BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
+
+        # Use the taxonomy with the untracked qna.yaml created in setUp
+        taxonomy_dir = self.tmp_path
+
+        pipeline_dir = self.testdata_path.joinpath("mock_pipelines")
+        date_suffix = (
+            datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+        )
+
+        preprocessed_dir = self.tmp_path.joinpath("preprocessed")
+        teacher_model_path = self.testdata_path.joinpath(
+            "models/instructlab/granite-7b-lab"
+        )
+        preprocess_taxonomy(
+            taxonomy_dir=taxonomy_dir,
+            output_dir=preprocessed_dir,
+            teacher_model_path=teacher_model_path,
+        )
+        docs = glob.glob(
+            str(preprocessed_dir.joinpath("documents", "knowledge_new_*", "phoenix.md"))
+        )
+        assert docs, f"Expected docs not found in {self.file_list()}"
+        samples_path = preprocessed_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            samples_path.is_file()
+        ), f"Expected samples file not found in {self.file_list()}"
+
+        client = MagicMock()
+        client.server_supports_batched = False
+        generated_dir = self.tmp_path.joinpath("generated")
+        generate_taxonomy(
+            client=client,
+            input_dir=preprocessed_dir,
+            output_dir=generated_dir,
+            pipeline=pipeline_dir,
+            num_cpus=1,  # Test is faster running on a single CPU vs forking
+            batch_size=0,  # Disable batch for tiny dataset and fastest test
+        )
+        generated_samples_path = generated_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            generated_samples_path.is_file()
+        ), f"Generated samples not found in {self.file_list()}"
+
+        postprocessed_dir = self.tmp_path.joinpath("postprocessed")
+        postprocess_taxonomy(
+            input_dir=generated_dir,
+            output_dir=postprocessed_dir,
+            date_suffix=date_suffix,
+            pipeline=pipeline_dir,
+        )
+        knowledge_recipe_file = postprocessed_dir.joinpath(
+            f"knowledge_recipe_{date_suffix}.yaml"
+        )
+        assert (
+            knowledge_recipe_file.is_file()
+        ), f"Generated knowledge recipe file not found in {self.file_list()}"
+        skills_recipe_file = postprocessed_dir.joinpath(
+            f"skills_recipe_{date_suffix}.yaml"
+        )
+        assert (
+            skills_recipe_file.is_file()
+        ), f"Generated skills recipe file not found in {self.file_list()}"
+
+        mixed_skills_output_file = (
+            f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl"
+        )
+        mix_datasets(
+            recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml",
+            output_file=mixed_skills_output_file,
+        )
+        assert pathlib.Path(
+            mixed_skills_output_file
+        ).is_file(), f"Generated mixed output not found in {self.file_list()}"
7 changes: 6 additions & 1 deletion tests/taxonomy.py
@@ -2,7 +2,7 @@
 
 # Standard
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 import shutil
 
 # Third Party
@@ -68,3 +68,8 @@ def __enter__(self):
 
 def __exit__(self, *args):
     self.teardown()
+
+
+def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]:
+    with open(skills_file_path, "r", encoding="utf-8") as skills_file:
+        return yaml.safe_load(skills_file)
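Moving load_test_skills into tests/taxonomy.py lets both tests/test_generate_data.py and the new functional test share one helper instead of each defining its own copy.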
8 changes: 3 additions & 5 deletions tests/test_generate_data.py
@@ -28,6 +28,9 @@
     generate_data,
 )
 
+# Local
+from .taxonomy import load_test_skills
+
 TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."
 
 TEST_TAXONOMY_BASE = "main"
@@ -232,11 +235,6 @@ def add_question_mark(q):
     return train_samples
 
 
-def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]:
-    with open(skills_file_path, "r", encoding="utf-8") as skills_file:
-        return yaml.safe_load(skills_file)
-
-
 def _noop_llmblock_generate(self, samples):
     """Generate mock output based on input samples.
