
Commit

Merge pull request #443 from bbrowning/separation-of-concerns
Split up `generate_data` and add a `mix_datasets` top level API
mergify[bot] authored Jan 16, 2025
2 parents a532a8d + 1f9394d commit 695c651
Showing 25 changed files with 1,037 additions and 215 deletions.
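The headline change is a new top-level `mix_datasets` API exported from `instructlab.sdg`, alongside the existing `generate_data`. Judging from the `docs/examples/mix_datasets/example_mixing.py` file added below, it takes a recipe YAML path and an output JSONL path; a minimal usage sketch (the file names here are illustrative):

from pathlib import Path

from instructlab.sdg import mix_datasets

# Mix datasets according to a recipe file and write the result as JSONL.
mix_datasets(Path("my_recipe.yaml"), Path("output/mixed.jsonl"))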
3 changes: 3 additions & 0 deletions .gitignore
@@ -167,3 +167,6 @@ cython_debug/
 
 # IDEs
 .vscode/
+
+# SDG examples output
+docs/examples/**/output
8 changes: 8 additions & 0 deletions docs/examples/mix_datasets/concatenate_recipe.yaml
@@ -0,0 +1,8 @@
# An example of how to concatenate two datasets
# Each dataset has a sampling_size of 1.0 to take all samples from both
datasets:
  - path: dataset_1.jsonl
    sampling_size: 1.0
  - path: dataset_2.jsonl
    sampling_size: 1.0
sys_prompt: I am a reliable AI assistant.
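With the two five-sample example datasets below, this recipe should yield a ten-sample mixed file: a `sampling_size` of 1.0 keeps every sample from each source before the sampled datasets are concatenated.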
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_1.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_1_1", "messages": [], "metadata": {}}
{"id": "dataset_1_2", "messages": [], "metadata": {}}
{"id": "dataset_1_3", "messages": [], "metadata": {}}
{"id": "dataset_1_4", "messages": [], "metadata": {}}
{"id": "dataset_1_5", "messages": [], "metadata": {}}
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_2.jsonl
@@ -0,0 +1,5 @@
{"id": "dataset_2_1", "messages": [], "metadata": {}}
{"id": "dataset_2_2", "messages": [], "metadata": {}}
{"id": "dataset_2_3", "messages": [], "metadata": {}}
{"id": "dataset_2_4", "messages": [], "metadata": {}}
{"id": "dataset_2_5", "messages": [], "metadata": {}}
18 changes: 18 additions & 0 deletions docs/examples/mix_datasets/example_mixing.py
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from pathlib import Path

# First Party
from instructlab.sdg import mix_datasets

output_dir = Path(__file__).parent.joinpath("output")
output_dir.mkdir(exist_ok=True)

concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)

weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
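Running this script (for example, `python docs/examples/mix_datasets/example_mixing.py`) should leave `concatenated.jsonl` and `weighted.jsonl` under `docs/examples/mix_datasets/output/`, the directory the new `.gitignore` entry above keeps out of version control.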
9 changes: 9 additions & 0 deletions docs/examples/mix_datasets/weighted_recipe.yaml
@@ -0,0 +1,9 @@
# An example of how to weight one dataset over another
# Dataset 1 has a sampling size of 2.0 to double its samples
# Dataset 2 has a sampling size of 0.2 to take 20% of its samples
datasets:
  - path: dataset_1.jsonl
    sampling_size: 2.0
  - path: dataset_2.jsonl
    sampling_size: 0.2
sys_prompt: I am a reliable AI assistant.
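The sampling sizes translate into sample counts roughly as follows; this is an illustrative sketch of the arithmetic only, not the library's actual sampling code:

import random


def sample_indices(num_rows: int, sampling_size: float) -> list[int]:
    # Illustrative only: keep int(num_rows * sampling_size) rows by repeating
    # the whole dataset for each integer multiple of its size and drawing the
    # fractional remainder at random without replacement.
    target = int(num_rows * sampling_size)
    whole, remainder = divmod(target, num_rows)
    indices = list(range(num_rows)) * whole
    indices += random.sample(range(num_rows), remainder)
    return indices


# With the five-sample example datasets above:
print(len(sample_indices(5, 2.0)))  # 10 rows drawn from dataset_1
print(len(sample_indices(5, 0.2)))  # 1 row drawn from dataset_2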
3 changes: 2 additions & 1 deletion src/instructlab/sdg/__init__.py
@@ -29,6 +29,7 @@
     "FULL_PIPELINES_PACKAGE",
     "SIMPLE_PIPELINES_PACKAGE",
     "generate_data",
+    "mix_datasets",
 )

# Local
@@ -50,7 +51,7 @@
     SelectorBlock,
     SetToMajorityValueBlock,
 )
-from .generate_data import generate_data
+from .generate_data import generate_data, mix_datasets
 from .pipeline import (
     FULL_PIPELINES_PACKAGE,
     SIMPLE_PIPELINES_PACKAGE,
23 changes: 20 additions & 3 deletions src/instructlab/sdg/datamixing.py
Expand Up @@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
Create the final mixed dataset by loading, sampling, and
concatenating all datasets in this recipe
"""
if not self.dataset_added:
if not self.datasets:
logger.error("No dataset added to the recipe")

mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
             sampling_size=self.NUM_SYNTH_SKILLS,
         )
 
+    def _write_mixed_recipe(self, recipe, output_file_recipe):
+        """
+        Write the recipes created during data mixing without writing the actual
+        mixed datasets to disk.
+        """
+        full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
+        recipe.save_recipe(full_recipe_path)
+
     def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
         """
         Mix the generated leaf node data into a single dataset and write it to
         disk. The heavy lifting is delegated to the Recipe class.
         """
+        self._write_mixed_recipe(recipe, output_file_recipe)
         if recipe.dataset_added:
-            full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
-            recipe.save_recipe(full_recipe_path)
             recipe.save_mixed_dataset(
                 os.path.join(self.output_dir, output_file_data),
                 self.num_procs,
             )
 
+    def write_recipes(self):
+        self._write_mixed_recipe(
+            self.knowledge_recipe,
+            self.output_file_knowledge_recipe,
+        )
+        self._write_mixed_recipe(
+            self.skills_recipe,
+            self.output_file_skills_recipe,
+        )
+
     def generate(self):
         self._gen_mixed_data(
             self.knowledge_recipe,
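Taken together, this hunk separates the two concerns named in the pull request title: `_write_mixed_recipe` now owns persisting a recipe YAML, `_gen_mixed_data` delegates to it before conditionally saving the mixed dataset, and the new `write_recipes` method lets callers write just the knowledge and skills recipes without mixing any data, which `generate` still does.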