Add examples for how to use data mixing
This adds a new docs/examples/mix_datasets folder with a couple of
example recipes, two sample datasets, and an example_mixing.py Python
script that shows how to mix datasets.

This also adds a test_examples.py file that actually runs our
examples, ensuring they work without error and generate the expected
mixed datasets.

Signed-off-by: Ben Browning <[email protected]>
bbrowning committed Jan 16, 2025
1 parent 6c8544e commit 1f9394d
Showing 11 changed files with 203 additions and 77 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -167,3 +167,6 @@ cython_debug/
 
 # IDEs
 .vscode/
+
+# SDG examples output
+docs/examples/**/output
8 changes: 8 additions & 0 deletions docs/examples/mix_datasets/concatenate_recipe.yaml
@@ -0,0 +1,8 @@
# An example of how to concatenate two datasets
# Each dataset has a sampling_size of 1.0 to take all samples from both
datasets:
  - path: dataset_1.jsonl
    sampling_size: 1.0
  - path: dataset_2.jsonl
    sampling_size: 1.0
sys_prompt: I am a reliable AI assistant.
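Since each source file below contains five samples and both use a sampling_size of 1.0, mixing with this recipe produces a single 10-sample dataset containing every sample from both files.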
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_1.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_1_1", "messages": [], "metadata": {}}
{"id": "dataset_1_2", "messages": [], "metadata": {}}
{"id": "dataset_1_3", "messages": [], "metadata": {}}
{"id": "dataset_1_4", "messages": [], "metadata": {}}
{"id": "dataset_1_5", "messages": [], "metadata": {}}
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_2.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_2_1", "messages": [], "metadata": {}}
{"id": "dataset_2_2", "messages": [], "metadata": {}}
{"id": "dataset_2_3", "messages": [], "metadata": {}}
{"id": "dataset_2_4", "messages": [], "metadata": {}}
{"id": "dataset_2_5", "messages": [], "metadata": {}}
18 changes: 18 additions & 0 deletions docs/examples/mix_datasets/example_mixing.py
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from pathlib import Path

# First Party
from instructlab.sdg import mix_datasets

output_dir = Path(__file__).parent.joinpath("output")
output_dir.mkdir(exist_ok=True)

concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)

weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
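The mixed output files are plain JSON Lines, so they can be inspected with the same jlload helper the functional test below uses. A minimal sketch (the relative path is an assumption that it is run from the repository root, after example_mixing.py has run):

# Load and count the weighted mix produced by example_mixing.py
from instructlab.sdg.utils.json import jlload

weighted = jlload("docs/examples/mix_datasets/output/weighted.jsonl")
print(len(weighted))  # 11 with the sample datasets in this directory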
9 changes: 9 additions & 0 deletions docs/examples/mix_datasets/weighted_recipe.yaml
@@ -0,0 +1,9 @@
# An example of how to weight one dataset over another
# Dataset 1 has a sampling size of 2.0 to double its samples
# Dataset 2 has a sampling size of 0.2 to take 20% of its samples
datasets:
  - path: dataset_1.jsonl
    sampling_size: 2.0
  - path: dataset_2.jsonl
    sampling_size: 0.2
sys_prompt: I am a reliable AI assistant.
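Applied to the five-sample files above, this recipe duplicates dataset_1 (5 × 2.0 = 10 samples) and downsamples dataset_2 (5 × 0.2 = 1 sample), so the mixed output holds 11 samples in total, which is what the functional test below asserts.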
7 changes: 7 additions & 0 deletions tests/functional/conftest.py
@@ -6,9 +6,16 @@
 import pytest
 
 TESTS_PATH = pathlib.Path(__file__).parent.parent.absolute()
+EXAMPLES_PATH = TESTS_PATH.parent.joinpath("docs", "examples")
 
 
 @pytest.fixture
 def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
     """Path to local test data directory"""
     yield TESTS_PATH / "testdata"
+
+
+@pytest.fixture
+def examples_path() -> typing.Generator[pathlib.Path, None, None]:
+    """Path to examples directory"""
+    yield EXAMPLES_PATH
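The new examples_path fixture resolves the in-tree docs/examples directory relative to the tests themselves, so functional tests can locate the example files regardless of the directory pytest is invoked from.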
40 changes: 40 additions & 0 deletions tests/functional/test_examples.py
@@ -0,0 +1,40 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import pathlib
import shutil
import subprocess
import sys

# First Party
from instructlab.sdg.utils.json import jlload


def test_example_mixing(tmp_path: pathlib.Path, examples_path: pathlib.Path):
    example_copy_path = tmp_path.joinpath("mix_datasets")
    shutil.copytree(examples_path.joinpath("mix_datasets"), example_copy_path)
    script = example_copy_path.joinpath("example_mixing.py")
    subprocess.check_call([sys.executable, str(script)], text=True)

    concatenated = jlload(example_copy_path.joinpath("output", "concatenated.jsonl"))
    assert len(concatenated) == 10
    from_ds_1 = []
    from_ds_2 = []
    for sample in concatenated:
        if sample["id"].startswith("dataset_1"):
            from_ds_1.append(sample)
        else:
            from_ds_2.append(sample)
    assert len(from_ds_1) == len(from_ds_2) == 5

    weighted = jlload(example_copy_path.joinpath("output", "weighted.jsonl"))
    assert len(weighted) == 11
    from_ds_1 = []
    from_ds_2 = []
    for sample in weighted:
        if sample["id"].startswith("dataset_1"):
            from_ds_1.append(sample)
        else:
            from_ds_2.append(sample)
    assert len(from_ds_1) == 10
    assert len(from_ds_2) == 1
170 changes: 99 additions & 71 deletions tests/functional/test_granular_api.py
@@ -4,10 +4,13 @@
 from datetime import datetime
 from unittest.mock import MagicMock
 import glob
+import os
 import pathlib
+import unittest
 
 # Third Party
 import git
+import pytest
 
 # First Party
 from instructlab.sdg import BlockRegistry
@@ -20,6 +23,7 @@
 
 # Local
 from ..mockllmblock import MockLLMBlock
+from ..taxonomy import load_test_skills
 
 
 def _clone_instructlab_taxonomy(taxonomy_dir):
@@ -29,75 +33,99 @@ def _clone_instructlab_taxonomy(taxonomy_dir):
     repo.git.checkout(taxonomy_commit)
 
 
-def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.Path):
-    # Register our mock block so we can reference it in pipelines
-    BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
-
-    # Clone a taxonomy and edit 1 file in it
-    taxonomy_dir = tmp_path.joinpath("taxonomy")
-    _clone_instructlab_taxonomy(taxonomy_dir)
-    changed_qna_yaml = taxonomy_dir.joinpath(
-        "knowledge", "science", "animals", "birds", "black_capped_chickadee", "qna.yaml"
-    )
-    with open(changed_qna_yaml, "a", encoding="utf-8") as file:
-        file.write("")
-
-    pipeline_dir = testdata_path.joinpath("mock_pipelines")
-    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
-
-    preprocessed_dir = tmp_path.joinpath("preprocessed")
-    teacher_model_path = testdata_path.joinpath("models/instructlab/granite-7b-lab")
-    preprocess_taxonomy(
-        taxonomy_dir=taxonomy_dir,
-        output_dir=preprocessed_dir,
-        teacher_model_path=teacher_model_path,
-    )
-    chickadee_docs = glob.glob(
-        str(
-            preprocessed_dir.joinpath(
-                "documents", "knowledge_science_*", "chickadee.md"
-            )
-        )
-    )
-    assert chickadee_docs
-    chickadee_samples_path = preprocessed_dir.joinpath(
-        "knowledge_science_animals_birds_black_capped_chickadee.jsonl"
-    )
-    assert chickadee_samples_path.is_file()
-
-    client = MagicMock()
-    client.server_supports_batched = False
-    generated_dir = tmp_path.joinpath("generated")
-    generate_taxonomy(
-        client=client,
-        input_dir=preprocessed_dir,
-        output_dir=generated_dir,
-        pipeline=pipeline_dir,
-    )
-    generated_chickadee_samples_path = generated_dir.joinpath(
-        "knowledge_science_animals_birds_black_capped_chickadee.jsonl"
-    )
-    assert generated_chickadee_samples_path.is_file()
-
-    postprocessed_dir = tmp_path.joinpath("postprocessed")
-    postprocess_taxonomy(
-        input_dir=generated_dir,
-        output_dir=postprocessed_dir,
-        date_suffix=date_suffix,
-        pipeline=pipeline_dir,
-    )
-    knowledge_recipe_file = postprocessed_dir.joinpath(
-        f"knowledge_recipe_{date_suffix}.yaml"
-    )
-    assert knowledge_recipe_file.is_file()
-    skills_recipe_file = postprocessed_dir.joinpath(f"skills_recipe_{date_suffix}.yaml")
-    assert skills_recipe_file.is_file()
-
-    mixed_skills_output_file = (
-        f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl"
-    )
-    mix_datasets(
-        recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml",
-        output_file=mixed_skills_output_file,
-    )
-    assert pathlib.Path(mixed_skills_output_file).is_file()
+class TestGranularAPI(unittest.TestCase):
+    @pytest.fixture(autouse=True)
+    def _init_taxonomy(self, taxonomy_dir, testdata_path, tmp_path):
+        self.test_taxonomy = taxonomy_dir
+        self.testdata_path = testdata_path
+        self.tmp_path = tmp_path
+
+    def setUp(self):
+        test_valid_knowledge_skill_file = self.testdata_path.joinpath(
+            "test_valid_knowledge_skill.yaml"
+        )
+        untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
+        test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
+        self.test_taxonomy.create_untracked(
+            untracked_knowledge_file, test_valid_knowledge_skill
+        )
+
+    def file_list(self):
+        return glob.glob(str(self.tmp_path.joinpath("**/*")), recursive=True)
+
+    def test_granular_api_end_to_end(self):
+        # Register our mock block so we can reference it in pipelines
+        BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
+
+        # Use the taxonomy with the untracked qna.yaml created in setUp
+        taxonomy_dir = self.tmp_path
+
+        pipeline_dir = self.testdata_path.joinpath("mock_pipelines")
+        date_suffix = (
+            datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+        )
+
+        preprocessed_dir = self.tmp_path.joinpath("preprocessed")
+        teacher_model_path = self.testdata_path.joinpath(
+            "models/instructlab/granite-7b-lab"
+        )
+        preprocess_taxonomy(
+            taxonomy_dir=taxonomy_dir,
+            output_dir=preprocessed_dir,
+            teacher_model_path=teacher_model_path,
+        )
+        docs = glob.glob(
+            str(preprocessed_dir.joinpath("documents", "knowledge_new_*", "phoenix.md"))
+        )
+        assert docs, f"Expected docs not found in {self.file_list()}"
+        samples_path = preprocessed_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            samples_path.is_file()
+        ), f"Expected samples file not found in {self.file_list()}"
+
+        client = MagicMock()
+        client.server_supports_batched = False
+        generated_dir = self.tmp_path.joinpath("generated")
+        generate_taxonomy(
+            client=client,
+            input_dir=preprocessed_dir,
+            output_dir=generated_dir,
+            pipeline=pipeline_dir,
+            num_cpus=1,  # Test is faster running on a single CPU vs forking
+            batch_size=0,  # Disable batch for tiny dataset and fastest test
+        )
+        generated_samples_path = generated_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            generated_samples_path.is_file()
+        ), f"Generated samples not found in {self.file_list()}"
+
+        postprocessed_dir = self.tmp_path.joinpath("postprocessed")
+        postprocess_taxonomy(
+            input_dir=generated_dir,
+            output_dir=postprocessed_dir,
+            date_suffix=date_suffix,
+            pipeline=pipeline_dir,
+        )
+        knowledge_recipe_file = postprocessed_dir.joinpath(
+            f"knowledge_recipe_{date_suffix}.yaml"
+        )
+        assert (
+            knowledge_recipe_file.is_file()
+        ), f"Generated knowledge recipe file not found in {self.file_list()}"
+        skills_recipe_file = postprocessed_dir.joinpath(
+            f"skills_recipe_{date_suffix}.yaml"
+        )
+        assert (
+            skills_recipe_file.is_file()
+        ), f"Generated skills recipe file not found in {self.file_list()}"
+
+        mixed_skills_output_file = (
+            f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl"
+        )
+        mix_datasets(
+            recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml",
+            output_file=mixed_skills_output_file,
+        )
+        assert pathlib.Path(
+            mixed_skills_output_file
+        ).is_file(), f"Generated mixed output not found in {self.file_list()}"
7 changes: 6 additions & 1 deletion tests/taxonomy.py
@@ -2,7 +2,7 @@
 
 # Standard
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 import shutil
 
 # Third Party
@@ -68,3 +68,8 @@ def __enter__(self):
 
 def __exit__(self, *args):
     self.teardown()
+
+
+def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]:
+    with open(skills_file_path, "r", encoding="utf-8") as skills_file:
+        return yaml.safe_load(skills_file)
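Moving load_test_skills into tests/taxonomy.py lets both tests/test_generate_data.py and the new functional test share one helper instead of each defining its own copy.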
8 changes: 3 additions & 5 deletions tests/test_generate_data.py
@@ -28,6 +28,9 @@
     generate_data,
 )
 
+# Local
+from .taxonomy import load_test_skills
+
 TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."
 
 TEST_TAXONOMY_BASE = "main"
@@ -232,11 +235,6 @@ def add_question_mark(q):
     return train_samples
 
 
-def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]:
-    with open(skills_file_path, "r", encoding="utf-8") as skills_file:
-        return yaml.safe_load(skills_file)
-
-
 def _noop_llmblock_generate(self, samples):
     """Generate mock output based on input samples.
