
Commit

Merge pull request #443 from bbrowning/separation-of-concerns
Split up `generate_data` and add a `mix_datasets` top level API
mergify[bot] authored Jan 16, 2025
2 parents a532a8d + 1f9394d commit 695c651
Showing 25 changed files with 1,037 additions and 215 deletions.
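The headline change is a new top-level `mix_datasets` API exported from `instructlab.sdg`, alongside the existing `generate_data`. Judging from the `docs/examples/mix_datasets/example_mixing.py` file added below, it takes a recipe YAML path and an output JSONL path; a minimal usage sketch (the file names here are illustrative):

from pathlib import Path

from instructlab.sdg import mix_datasets

# Mix datasets according to a recipe file and write the result as JSONL.
mix_datasets(Path("my_recipe.yaml"), Path("output/mixed.jsonl"))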
3 changes: 3 additions & 0 deletions .gitignore
@@ -167,3 +167,6 @@ cython_debug/
 
 # IDEs
 .vscode/
+
+# SDG examples output
+docs/examples/**/output
8 changes: 8 additions & 0 deletions docs/examples/mix_datasets/concatenate_recipe.yaml
@@ -0,0 +1,8 @@
# An example of how to concatenate two datasets
# Each dataset has a sampling_size of 1.0 to take all samples from both
datasets:
  - path: dataset_1.jsonl
    sampling_size: 1.0
  - path: dataset_2.jsonl
    sampling_size: 1.0
sys_prompt: I am a reliable AI assistant.
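With the two five-sample example datasets below, this recipe should yield a ten-sample mixed file: a `sampling_size` of 1.0 keeps every sample from each source before the sampled datasets are concatenated.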
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_1.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_1_1", "messages": [], "metadata": {}}
{"id": "dataset_1_2", "messages": [], "metadata": {}}
{"id": "dataset_1_3", "messages": [], "metadata": {}}
{"id": "dataset_1_4", "messages": [], "metadata": {}}
{"id": "dataset_1_5", "messages": [], "metadata": {}}
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_2.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_2_1", "messages": [], "metadata": {}}
{"id": "dataset_2_2", "messages": [], "metadata": {}}
{"id": "dataset_2_3", "messages": [], "metadata": {}}
{"id": "dataset_2_4", "messages": [], "metadata": {}}
{"id": "dataset_2_5", "messages": [], "metadata": {}}
18 changes: 18 additions & 0 deletions docs/examples/mix_datasets/example_mixing.py
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from pathlib import Path

# First Party
from instructlab.sdg import mix_datasets

output_dir = Path(__file__).parent.joinpath("output")
output_dir.mkdir(exist_ok=True)

concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)

weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
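Running this script (for example, `python docs/examples/mix_datasets/example_mixing.py`) should leave `concatenated.jsonl` and `weighted.jsonl` under `docs/examples/mix_datasets/output/`, the directory the new `.gitignore` entry above keeps out of version control.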
9 changes: 9 additions & 0 deletions docs/examples/mix_datasets/weighted_recipe.yaml
@@ -0,0 +1,9 @@
# An example of how to weight one dataset over another
# Dataset 1 has a sampling size of 2.0 to double its samples
# Dataset 2 has a sampling size of 0.2 to take 20% of its samples
datasets:
  - path: dataset_1.jsonl
    sampling_size: 2.0
  - path: dataset_2.jsonl
    sampling_size: 0.2
sys_prompt: I am a reliable AI assistant.
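The sampling sizes translate into sample counts roughly as follows; this is an illustrative sketch of the arithmetic only, not the library's actual sampling code:

import random


def sample_indices(num_rows: int, sampling_size: float) -> list[int]:
    # Illustrative only: keep int(num_rows * sampling_size) rows by repeating
    # the whole dataset for each integer multiple of its size and drawing the
    # fractional remainder at random without replacement.
    target = int(num_rows * sampling_size)
    whole, remainder = divmod(target, num_rows)
    indices = list(range(num_rows)) * whole
    indices += random.sample(range(num_rows), remainder)
    return indices


# With the five-sample example datasets above:
print(len(sample_indices(5, 2.0)))  # 10 rows drawn from dataset_1
print(len(sample_indices(5, 0.2)))  # 1 row drawn from dataset_2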
3 changes: 2 additions & 1 deletion src/instructlab/sdg/__init__.py
@@ -29,6 +29,7 @@
     "FULL_PIPELINES_PACKAGE",
     "SIMPLE_PIPELINES_PACKAGE",
     "generate_data",
+    "mix_datasets",
 )

# Local
@@ -50,7 +51,7 @@
     SelectorBlock,
     SetToMajorityValueBlock,
 )
-from .generate_data import generate_data
+from .generate_data import generate_data, mix_datasets
 from .pipeline import (
     FULL_PIPELINES_PACKAGE,
     SIMPLE_PIPELINES_PACKAGE,
23 changes: 20 additions & 3 deletions src/instructlab/sdg/datamixing.py
Expand Up @@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
Create the final mixed dataset by loading, sampling, and
concatenating all datasets in this recipe
"""
if not self.dataset_added:
if not self.datasets:
logger.error("No dataset added to the recipe")

mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
             sampling_size=self.NUM_SYNTH_SKILLS,
         )
 
+    def _write_mixed_recipe(self, recipe, output_file_recipe):
+        """
+        Write the recipes created during data mixing without writing the actual
+        mixed datasets to disk.
+        """
+        full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
+        recipe.save_recipe(full_recipe_path)
+
     def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
         """
         Mix the generated leaf node data into a single dataset and write it to
         disk. The heavy lifting is delegated to the Recipe class.
         """
+        self._write_mixed_recipe(recipe, output_file_recipe)
         if recipe.dataset_added:
-            full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
-            recipe.save_recipe(full_recipe_path)
             recipe.save_mixed_dataset(
                 os.path.join(self.output_dir, output_file_data),
                 self.num_procs,
             )
 
+    def write_recipes(self):
+        self._write_mixed_recipe(
+            self.knowledge_recipe,
+            self.output_file_knowledge_recipe,
+        )
+        self._write_mixed_recipe(
+            self.skills_recipe,
+            self.output_file_skills_recipe,
+        )
+
     def generate(self):
         self._gen_mixed_data(
             self.knowledge_recipe,
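Taken together, this hunk separates the two concerns named in the pull request title: `_write_mixed_recipe` now owns persisting a recipe YAML, `_gen_mixed_data` delegates to it before conditionally saving the mixed dataset, and the new `write_recipes` method lets callers write just the knowledge and skills recipes without mixing any data, which `generate` still does.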