From 0f784655c38c4e1aa4426fc8b0b98b8c27fb01c3 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 5 Dec 2024 12:39:08 -0500
Subject: [PATCH 1/3] Split `generate_data` into multiple discrete steps

This doesn't move things out into separate files yet, but it does split
the existing functionality of `generate_data` into multiple discrete
steps and changes `generate_data` to just call those steps.

This is a step towards cleaner separation between the steps and towards
top-level Python APIs for each discrete step, for advanced use cases
that need more than a single end-to-end generation pipeline.

Signed-off-by: Ben Browning
---
 src/instructlab/sdg/__init__.py               |   3 +-
 src/instructlab/sdg/datamixing.py             |  23 +-
 src/instructlab/sdg/generate_data.py          | 595 +++++++++++++-----
 src/instructlab/sdg/pipeline.py               |   5 +-
 src/instructlab/sdg/utils/json.py             |   6 +
 src/instructlab/sdg/utils/models.py           |   9 +-
 src/instructlab/sdg/utils/taxonomy.py         |  30 +-
 tests/conftest.py                             |   2 +-
 tests/functional/__init__.py                  |   0
 tests/functional/test_granular_api.py         | 101 +++
 tests/mockllmblock.py                         |  55 ++
 tests/test_generate_data.py                   |  42 +-
 tests/test_models.py                          |   3 +
 .../mock_pipelines/freeform_skills.yaml       |  53 ++
 .../mock_pipelines/grounded_skills.yaml       |  70 +++
 tests/testdata/mock_pipelines/knowledge.yaml  | 113 ++++
 16 files changed, 902 insertions(+), 208 deletions(-)
 create mode 100644 tests/functional/__init__.py
 create mode 100644 tests/functional/test_granular_api.py
 create mode 100644 tests/mockllmblock.py
 create mode 100644 tests/testdata/mock_pipelines/freeform_skills.yaml
 create mode 100644 tests/testdata/mock_pipelines/grounded_skills.yaml
 create mode 100644 tests/testdata/mock_pipelines/knowledge.yaml

diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py
index 490df8e4..5cc9d95f 100644
--- a/src/instructlab/sdg/__init__.py
+++ b/src/instructlab/sdg/__init__.py
@@ -29,6 +29,7 @@
     "FULL_PIPELINES_PACKAGE",
     "SIMPLE_PIPELINES_PACKAGE",
     "generate_data",
+    "mix_datasets",
 )
 
 # Local
@@ -50,7 +51,7 @@
     SelectorBlock,
     SetToMajorityValueBlock,
 )
-from .generate_data import generate_data
+from .generate_data import generate_data, mix_datasets
 from .pipeline import (
     FULL_PIPELINES_PACKAGE,
     SIMPLE_PIPELINES_PACKAGE,
diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
index e6ca8675..de31e136 100644
--- a/src/instructlab/sdg/datamixing.py
+++ b/src/instructlab/sdg/datamixing.py
@@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
         Create the final mixed dataset by loading, sampling, and
         concatenating all datasets in this recipe
         """
-        if not self.dataset_added:
+        if not self.datasets:
             logger.error("No dataset added to the recipe")
 
         mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
             sampling_size=self.NUM_SYNTH_SKILLS,
         )
 
+    def _write_mixed_recipe(self, recipe, output_file_recipe):
+        """
+        Write the recipes created during data mixing without writing the actual
+        mixed datasets to disk.
+        """
+        full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
+        recipe.save_recipe(full_recipe_path)
+
     def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
         """
         Mix the generated leaf node data into a single dataset and write it to
         disk. The heavy lifting is delegated to the Recipe class.
""" + self._write_mixed_recipe(recipe, output_file_recipe) if recipe.dataset_added: - full_recipe_path = os.path.join(self.output_dir, output_file_recipe) - recipe.save_recipe(full_recipe_path) recipe.save_mixed_dataset( os.path.join(self.output_dir, output_file_data), self.num_procs, ) + def write_recipes(self): + self._write_mixed_recipe( + self.knowledge_recipe, + self.output_file_knowledge_recipe, + ) + self._write_mixed_recipe( + self.skills_recipe, + self.output_file_skills_recipe, + ) + def generate(self): self._gen_mixed_data( self.knowledge_recipe, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index badf3834..a14a190a 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -5,7 +5,7 @@ from importlib import resources from pathlib import Path from typing import Optional -import dataclasses +import glob import json import logging import os @@ -13,13 +13,19 @@ # Third Party # instructlab - All of these need to go away (other than sdg) - issue #6 +from datasets import Dataset from xdg_base_dirs import xdg_data_dirs, xdg_data_home import openai import yaml # First Party from instructlab.sdg.blocks.llmblock import DEFAULT_MAX_NUM_TOKENS -from instructlab.sdg.datamixing import DataMixer, _get_question_hack, _get_response_hack +from instructlab.sdg.datamixing import ( + DataMixer, + Recipe, + _get_question_hack, + _get_response_hack, +) from instructlab.sdg.eval_data import generate_eval_task_data, mmlubench_pipe_init from instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, @@ -27,8 +33,8 @@ Pipeline, PipelineContext, ) -from instructlab.sdg.utils import GenerateException, models -from instructlab.sdg.utils.json import jldump +from instructlab.sdg.utils import GenerateException +from instructlab.sdg.utils.json import jldump, jlload from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, read_taxonomy_leaf_nodes, @@ -38,6 +44,10 @@ _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." +DEFAULT_CHUNK_WORD_COUNT = 1000 +DEFAULT_TAXONOMY_BASE = "empty" +DEFAULT_SERVER_CTX_SIZE = 4096 + def _unescape(s): return bytes(s, "utf-8").decode("utf-8").strip() @@ -115,20 +125,21 @@ def _gen_train_data( def _knowledge_seed_example_to_test_data(seed_example, system_prompt): res = [] - for qna in seed_example["questions_and_answers"]: - user = qna["question"] + "\n" + seed_example["context"] - res.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(qna["answer"]), - } - ) + for i in range(3): + idx = i + 1 + user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example[f"icl_response_{idx}"]), + } + if system_prompt: + test_sample["system"] = system_prompt + res.append(test_sample) return res def _gen_test_data( - leaf_nodes, + seed_examples, output_file_test, system_prompt, ): @@ -137,30 +148,29 @@ def _gen_test_data( in instructlab/instructlab. 
""" test_data = [] - for _, leaf_node in leaf_nodes.items(): - for seed_example in leaf_node: - if "questions_and_answers" in seed_example: - test_data.extend( - _knowledge_seed_example_to_test_data(seed_example, system_prompt) - ) - continue + for seed_example in seed_examples: + if "icl_query_1" in seed_example: + test_data.extend( + _knowledge_seed_example_to_test_data(seed_example, system_prompt) + ) + continue - # skill seed example + # skill seed example - user = seed_example["instruction"] # question + user = seed_example["seed_question"] # question - if len(seed_example["input"]) > 0: - user += "\n" + seed_example["input"] # context + if seed_example["leaf_node_type"] == "grounded_skill": + user += "\n" + seed_example["seed_context"] # context - test_data.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(seed_example["output"]), # answer - } - ) + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example["seed_response"]), # answer + } + if system_prompt: + test_sample["system"] = system_prompt + test_data.append(test_sample) - jldump(test_data, output_file_test) + jldump(test_data, output_file_test) def _check_pipeline_dir(pipeline): @@ -171,6 +181,31 @@ def _check_pipeline_dir(pipeline): ) +def _locate_docling_models(): + # Search for the models in User and Site data directories + data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] + data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) + + docling_model_path = None + sdg_models_path = docling_model_path + for d in data_dirs: + if os.path.exists(os.path.join(d, "models")): + sdg_models_path = os.path.join(d, "models") + break + + if sdg_models_path is not None: + try: + with open( + os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" + ) as file: + config = yaml.safe_load(file) + docling_model_path = config["models"][0]["path"] + except (FileNotFoundError, NotADirectoryError, PermissionError) as e: + logger.warning(f"unable to read docling models path from config.yaml {e}") + + return docling_model_path + + def _context_init( client: openai.OpenAI, model_family: str, @@ -208,23 +243,6 @@ def _sdg_init(ctx, pipeline): data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - docling_model_path = None - sdg_models_path = docling_model_path - for d in data_dirs: - if os.path.exists(os.path.join(d, "models")): - sdg_models_path = os.path.join(d, "models") - break - - if sdg_models_path is not None: - try: - with open( - os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" - ) as file: - config = yaml.safe_load(file) - docling_model_path = config["models"][0]["path"] - except (FileNotFoundError, NotADirectoryError, PermissionError) as e: - logger.warning(f"unable to read docling models path from config.yaml {e}") - for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): @@ -256,12 +274,11 @@ def load_pipeline(yaml_basename): load_pipeline("knowledge.yaml"), load_pipeline("freeform_skills.yaml"), load_pipeline("grounded_skills.yaml"), - docling_model_path, ) def _mixer_init( - ctx, + num_procs, output_dir, date_suffix, knowledge_auxiliary_inst, @@ -275,91 +292,135 @@ def _mixer_init( output_dir, date_suffix, system_prompt, - ctx.dataset_num_procs, + num_procs, knowledge_auxiliary_inst, ) -# This is part of the public API, and used by instructlab. 
-# TODO - parameter removal needs to be done in sync with a CLI change.
-# to be removed: logger
-def generate_data(
-    client: openai.OpenAI,
-    logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
-    system_prompt: Optional[str] = None,
-    use_legacy_pretraining_format: Optional[bool] = True,
-    model_family: Optional[str] = None,
-    model_name: Optional[str] = None,
-    num_cpus: Optional[int] = None,
-    num_instructions_to_generate: Optional[int] = 30,
-    taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
-    taxonomy_base: Optional[str] = None,
-    output_dir: Optional[str] = None,
-    console_output=True,
-    yaml_rules: Optional[str] = None,
-    chunk_word_count=None,
-    server_ctx_size=None,
-    pipeline: Optional[str] = "simple",
-    batch_size: Optional[int] = None,
-    checkpoint_dir: Optional[str] = None,
-    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
-) -> None:
-    """Generate data for training and testing a model.
+def _extract_leaf_node_path_and_type(sample):
+    leaf_node_path = sample.get("leaf_node_path", "unknown")
+    leaf_node_type = sample.get("leaf_node_type")
+    return leaf_node_path, leaf_node_type
 
-    This currently serves as the primary interface from the `ilab` CLI to the `sdg` library.
-    It is somewhat a transitionary measure, as this function existed back when all of the
-    functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to
-    use the SDG library constructs directly, and this function will likely be removed.
 
-    Args:
-        pipeline: This argument may be either an alias defined in a user or site "data directory"
-                  or an alias defined by the sdg library ("simple", "full")(if the data directory has no matches),
-                  or an absolute path to a directory containing the pipeline YAML files.
-                  We expect three files to be present in this directory: "knowledge.yaml",
-                  "freeform_skills.yaml", and "grounded_skills.yaml".
+def preprocess_taxonomy(
+    taxonomy_dir,
+    output_dir,
+    chunk_word_count=DEFAULT_CHUNK_WORD_COUNT,
+    server_ctx_size=DEFAULT_SERVER_CTX_SIZE,
+    taxonomy_base=DEFAULT_TAXONOMY_BASE,
+    teacher_model_path: Optional[str] = None,
+    yaml_rules: Optional[str] = None,
+    test_output_file: Optional[str] = None,
+    system_prompt: Optional[str] = None,
+):
     """
-    generate_start = time.time()
+    Preprocess a taxonomy into input samples suitable for use with
+    data generation pipelines. This does the following steps:
+
+    - Determine changed leaf nodes in the taxonomy
+    - Retrieve knowledge documents for changed taxonomy leaf nodes
+    - Convert any non-markdown knowledge documents to markdown
+    - Write the Docling json and markdown outputs from this conversion to
+      disk for other processes to consume if needed.
+    - Chunk the converted knowledge documents to the desired chunk sizes.
+    - Turn the qna.yaml and knowledge documents into samples in the format
+      expected by the `simple` and `full` data generation pipelines shipped
+      in SDG.
+    - Write these samples to disk, with one file per taxonomy leaf node.
 
-    system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT
+    Args:
+        taxonomy_dir: The path to the taxonomy
+        output_dir: Where to write the samples created for use with data generation
+        chunk_word_count: The target number of words per document chunk
+        server_ctx_size: The maximum number of tokens the inference server used
+                         during data generation can handle
+        taxonomy_base: Determines how we calculate what has changed. This should
+                       be a git reference or the special value of 'empty' which
+                       means assume the entire taxonomy has changed.
+        teacher_model_path: Path to the teacher model on disk, which we'll use to
+                            load its tokenizer for use with document chunking.
+        yaml_rules: Path to a custom YAML rules file for YAML linting.
+        test_output_file: Path to write a file with generated test samples
+        system_prompt: System prompt to use when generating test samples
+
+    Returns:
+        List[str]: The list of output sample files written to disk.
 
-    # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp
-    if batch_size is None:
-        batch_size = 0
+    """
+    logger.info("Converting taxonomy to samples")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+    output_files = []
 
-    if not os.path.exists(output_dir):
-        os.mkdir(output_dir)
+    if not (taxonomy_dir and os.path.exists(taxonomy_dir)):
+        raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.")
 
-    if not (taxonomy and os.path.exists(taxonomy)):
-        raise GenerateException(f"Error: taxonomy ({taxonomy}) does not exist.")
-
-    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
-    document_output_dir = Path(output_dir) / f"documents-{date_suffix}"
+    document_output_dir = output_dir.joinpath("documents")
+    docling_model_path = _locate_docling_models()
 
     leaf_nodes = read_taxonomy_leaf_nodes(
-        taxonomy, taxonomy_base, yaml_rules, document_output_dir
+        taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir
     )
     if not leaf_nodes:
         raise GenerateException("Error: No new leaf nodes found in the taxonomy.")
 
-    name = Path(model_name).stem  # Just in case it is a file path
-    output_file_messages = f"messages_{name}_{date_suffix}.jsonl"
-    output_file_test = f"test_{name}_{date_suffix}.jsonl"
-    output_file_train = f"train_{name}_{date_suffix}.jsonl"
+    # TODO: This is all a temporary hack here, as we either need to
+    # remove, deprecate, or otherwise determine the right way to
+    # support test samples
+    all_samples = []
+    for leaf_node in leaf_nodes.values():
+        leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
+        samples = leaf_node_to_samples(
+            leaf_node,
+            server_ctx_size,
+            chunk_word_count,
+            document_output_dir,
+            teacher_model_path,
+            docling_model_path=docling_model_path,
+        )
 
-    _gen_test_data(
-        leaf_nodes,
-        os.path.join(output_dir, output_file_test),
-        system_prompt,
-    )
+        if not samples:
+            raise GenerateException("Error: No samples found in leaf node.")
 
-    logger.debug(f"Generating to: {os.path.join(output_dir, output_file_test)}")
+        logger.debug("Samples: %s", samples)
+
+        output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl")
+        all_samples.extend(samples)
+        jldump(samples, output_file)
+        output_files.append(str(output_file))
+
+    if test_output_file:
+        _gen_test_data(
+            all_samples,
+            test_output_file,
+            system_prompt,
+        )
+        logger.debug(f"Generating test data to: {test_output_file}")
 
-    model_family = models.get_model_family(model_family, model_name)
+    logger.info("Taxonomy converted to samples and written to %s", output_dir)
+    return output_files
 
+
+def generate_taxonomy(
+    client: openai.OpenAI,
+    input_dir: str,
+    output_dir: str,
+    logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
+    model_family: Optional[str] = None,
+    model_id: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    console_output=True,
+    pipeline: Optional[str] = "simple",
+    batch_size: Optional[int] = None,
+    checkpoint_dir: Optional[str] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+):
     ctx = _context_init(
         client,
         model_family,
-        model_name,
+        model_id,
         num_instructions_to_generate,
         checkpoint_dir,
         1,  # save_freq
@@ -368,20 +429,8 @@ def generate_data(
         max_num_tokens=max_num_tokens,
     )
 
-    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = (
-        _sdg_init(ctx, pipeline)
-    )
-
-    # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
-    mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None)
-    mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx)
-
-    mixer = _mixer_init(
-        ctx,
-        output_dir,
-        date_suffix,
-        knowledge_pipe.auxiliary_inst,
-        system_prompt,
+    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
+        ctx, pipeline
     )
 
     if console_output:
         logger.info(
             "Synthesizing new instructions. If you aren't satisfied with the generated instructions, interrupt training (Ctrl-C) and try adjusting your YAML files. Adding more examples may help."
         )
 
-    generated_data = []
-    empty_sdg_leaf_nodes = []
-    for leaf_node in leaf_nodes.values():
-        is_knowledge = False
-        leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
-        samples = leaf_node_to_samples(
-            leaf_node,  # pylint: disable=duplicate-code
-            server_ctx_size,
-            chunk_word_count,
-            document_output_dir,
-            model_name,
-            docling_model_path=docling_model_path,
-        )
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+    empty_input_files = []
+    for input_file in input_files:
+        logger.debug("Generating data from input file: %s", input_file)
+        samples = jlload(input_file)
         if not samples:
-            raise GenerateException("Error: No samples found in leaf node.")
-
-        if "document" in samples.column_names:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        if leaf_node_type == "knowledge":
             pipe = knowledge_pipe
-            is_knowledge = True
-
-        elif "seed_context" in samples.column_names:
+        elif leaf_node_type == "grounded_skill":
             pipe = grounded_skills_pipe
-
         else:
             pipe = freeform_skills_pipe
 
-        logger.debug("Samples: %s", samples)
+        samples_ds = Dataset.from_list(samples)
+        logger.debug("Generating from samples: %s", samples_ds)
 
-        new_generated_data = pipe.generate(samples, leaf_node_path)
+        new_generated_data = pipe.generate(samples_ds, leaf_node_path)
         if len(new_generated_data) == 0:
-            empty_sdg_leaf_nodes.append(leaf_node_path)
-            logger.warning("Empty dataset for qna node: %s", leaf_node_path)
+            empty_input_files.append(input_file)
+            logger.warning("Empty generated dataset for input file: %s", input_file)
             continue
-        generated_data.append(new_generated_data)
 
-    logger.info("Generated %d samples", len(generated_data))
-    logger.debug("Generated data: %s", generated_data)
+        output_file = os.path.join(output_dir, os.path.basename(input_file))
+        jldump(new_generated_data, output_file)
+        logger.info("Generated %d samples", len(new_generated_data))
+        logger.debug("Generated data: %s", new_generated_data)
+
+    if len(empty_input_files) > 0:
+        logger.warning(
+            "Input sample files with empty sdg output: {}".format(
+                " ".join(empty_input_files)
+            )
+        )
+
+
+def generate_taxonomy_eval(
+    client: openai.OpenAI,
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    model_family: Optional[str] = None,
+    model_id: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    batch_size: Optional[int] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+):
+    ctx = _context_init(
+        client,
+        model_family,
+        model_id,
+        num_instructions_to_generate,
+        None,  # disable checkpoints for eval pipeline
+        1,  # save_freq
+        batch_size=batch_size,
+        batch_num_workers=num_cpus,
+        max_num_tokens=max_num_tokens,
+    )
+    mmlu_bench_pipe = mmlubench_pipe_init(ctx)
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    for input_file in input_files:
+        logger.debug("Generating eval data from input file: %s", input_file)
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        samples_ds = Dataset.from_list(samples)
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
 
         if is_knowledge:
-            # generate mmlubench data for the current leaf node
             generate_eval_task_data(
                 mmlu_bench_pipe,
                 leaf_node_path,
-                samples,
+                samples_ds,
                 output_dir,
                 date_suffix,
             )
 
+
+def postprocess_taxonomy(
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    pipeline: Optional[str] = "simple",
+    num_procs: Optional[int] = PipelineContext.DEFAULT_DATASET_NUM_PROCS,
+    system_prompt: Optional[str] = _SYS_PROMPT,
+    use_legacy_pretraining_format: Optional[bool] = True,
+):
+    knowledge_pipe, _, _ = _sdg_init(None, pipeline)
+    mixer = _mixer_init(
+        num_procs,
+        output_dir,
+        date_suffix,
+        knowledge_pipe.auxiliary_inst,
+        system_prompt,
+    )
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    output_file_messages = f"messages_{date_suffix}.jsonl"
+    output_file_train = f"train_{date_suffix}.jsonl"
+
+    all_generated_data = []
+    for input_file in input_files:
+        logger.debug(
+            "Postprocessing generated taxonomy data in input file: %s", input_file
+        )
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
+
+        samples_ds = Dataset.from_list(samples)
+        logger.debug("Postprocessing from samples: %s", samples_ds)
+        all_generated_data.append(samples_ds)
+
         mixer.collect(
             leaf_node_path,
-            new_generated_data,
+            samples_ds,
             is_knowledge,
             use_legacy_pretraining_format,
         )
 
     _gen_train_data(
-        generated_data,
+        all_generated_data,
         os.path.join(output_dir, output_file_train),
         os.path.join(output_dir, output_file_messages),
         system_prompt,
     )
 
-    mixer.generate()
+    mixer.write_recipes()
+
+
+def mix_datasets(
+    recipe_file: str,
+    output_file: str,
+    num_proc: Optional[int] = 8,
+):
+    recipe = Recipe(recipe_file)
+    if recipe.datasets:
+        recipe.save_mixed_dataset(output_file, num_proc)
+    else:
+        logger.info("Not mixing empty recipe file: %s", recipe_file)
+
+
+# This is part of the public API, and used by instructlab.
+# TODO - parameter removal needs to be done in sync with a CLI change.
+# to be removed: logger +def generate_data( + client: openai.OpenAI, + logger: logging.Logger = logger, # pylint: disable=redefined-outer-name + system_prompt: Optional[str] = None, + use_legacy_pretraining_format: Optional[bool] = True, + model_family: Optional[str] = None, + model_name: Optional[str] = None, + num_cpus: Optional[int] = None, + num_instructions_to_generate: Optional[int] = 30, + taxonomy: Optional[str] = None, # TODO rename to taxonomy_path to match config + taxonomy_base: Optional[str] = None, + output_dir: Optional[str] = None, + console_output=True, + yaml_rules: Optional[str] = None, + chunk_word_count=None, + server_ctx_size=None, + pipeline: Optional[str] = "simple", + batch_size: Optional[int] = None, + checkpoint_dir: Optional[str] = None, + max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS, +) -> None: + """Generate data for training and testing a model. + + This currently serves as the primary interface from the `ilab` CLI to the `sdg` library. + It is somewhat a transitionary measure, as this function existed back when all of the + functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to + use the SDG library constructs directly, and this function will likely be removed. + + Args: + pipeline: This argument may be either an alias defined in a user or site "data directory" + or an alias defined by the sdg library ("simple", "full")(if the data directory has no matches), + or an absolute path to a directory containing the pipeline YAML files. + We expect three files to be present in this directory: "knowledge.yaml", + "freeform_skills.yaml", and "grounded_skills.yaml". + """ + generate_start = time.time() + + system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT + + # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp + if batch_size is None: + batch_size = 0 + + date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_") + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + output_file_test = output_dir.joinpath(f"test_{date_suffix}.jsonl") + preprocessed_dir = output_dir.joinpath(f"preprocessed_{date_suffix}") + generated_dir = output_dir.joinpath(f"generated_{date_suffix}") + + # This writes samples to disk in our output_dir and returns the + # list of files created + preprocess_taxonomy( + taxonomy, + output_dir=preprocessed_dir, + chunk_word_count=chunk_word_count, + server_ctx_size=server_ctx_size, + taxonomy_base=taxonomy_base, + teacher_model_path=model_name, + yaml_rules=yaml_rules, + test_output_file=output_file_test, + system_prompt=system_prompt, + ) + + generate_taxonomy( + client, + input_dir=preprocessed_dir, + output_dir=generated_dir, + logger=logger, + model_family=model_family, + model_id=model_name, + num_cpus=num_cpus, + num_instructions_to_generate=num_instructions_to_generate, + console_output=console_output, + pipeline=pipeline, + batch_size=batch_size, + checkpoint_dir=checkpoint_dir, + max_num_tokens=max_num_tokens, + ) + + generate_taxonomy_eval( + input_dir=preprocessed_dir, + output_dir=output_dir, + date_suffix=date_suffix, + client=client, + model_family=model_family, + model_id=model_name, + num_cpus=num_cpus, + num_instructions_to_generate=num_instructions_to_generate, + batch_size=batch_size, + max_num_tokens=max_num_tokens, + ) + + postprocess_taxonomy( + input_dir=generated_dir, + output_dir=output_dir, + date_suffix=date_suffix, + pipeline=pipeline, + system_prompt=system_prompt, + 
use_legacy_pretraining_format=use_legacy_pretraining_format, + ) + + mix_datasets( + recipe_file=f"{output_dir}/skills_recipe_{date_suffix}.yaml", + output_file=f"{output_dir}/skills_train_msgs_{date_suffix}.jsonl", + ) + mix_datasets( + recipe_file=f"{output_dir}/knowledge_recipe_{date_suffix}.yaml", + output_file=f"{output_dir}/knowledge_train_msgs_{date_suffix}.jsonl", + ) generate_duration = time.time() - generate_start logger.info(f"Generation took {generate_duration:.2f}s") - if len(empty_sdg_leaf_nodes) > 0: - logger.warning( - "Leaf nodes with empty sdg output: {}".format( - " ".join(empty_sdg_leaf_nodes) - ) - ) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 59613a8e..ce362668 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -16,7 +16,7 @@ # First Party from instructlab.sdg.checkpointing import Checkpointer -from instructlab.sdg.utils import pandas +from instructlab.sdg.utils import models, pandas # Local from .blocks import llmblock @@ -71,6 +71,9 @@ class PipelineContext: # pylint: disable=too-many-instance-attributes batch_size: int = DEFAULT_BATCH_SIZE batch_num_workers: Optional[int] = None + def __post_init__(self): + self.model_family = models.get_model_family(self.model_family, self.model_id) + @property def batching_enabled(self) -> bool: """Batching is enabled IFF the batch size is specified and the number of diff --git a/src/instructlab/sdg/utils/json.py b/src/instructlab/sdg/utils/json.py index 041d817b..1ec0b70a 100644 --- a/src/instructlab/sdg/utils/json.py +++ b/src/instructlab/sdg/utils/json.py @@ -60,3 +60,9 @@ def jldump(data: Iterable[Any], out: str | io.IOBase) -> None: for entry in data: json.dump(entry, outfile, ensure_ascii=False) outfile.write("\n") + + +def jlload(f, mode="r"): + """Load a .jsonl file into a list of dictionaries.""" + with _make_r_io_base(f, mode) as f_: + return [json.loads(l) for l in f_.read().splitlines()] diff --git a/src/instructlab/sdg/utils/models.py b/src/instructlab/sdg/utils/models.py index da01421b..b6375e57 100644 --- a/src/instructlab/sdg/utils/models.py +++ b/src/instructlab/sdg/utils/models.py @@ -22,5 +22,10 @@ def get_model_family(model_family, model_path): return model_family # Try to guess the model family based on the model's filename - guess = re.match(r"^\w*", os.path.basename(model_path)).group(0).lower() - return guess if guess in registry else DEFAULT_MODEL_FAMILY + if model_path: + guess = re.match(r"^\w*", os.path.basename(model_path)).group(0).lower() + if guess in registry: + return guess + + # Nothing was found, so just return the default + return DEFAULT_MODEL_FAMILY diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 2a23072f..ed0d7940 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -418,6 +418,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: "icl_document": icl_.get("context", ""), "document_outline": icl_.get("document_outline", ""), "domain": domain, + "leaf_node_type": "knowledge", } qna_pairs = icl_.get("questions_and_answers", []) @@ -431,7 +432,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: chunked_dataset.append(record) - return Dataset.from_list(chunked_dataset) + return chunked_dataset def _knowledge_leaf_node_to_samples( @@ -464,12 +465,23 @@ def _skill_leaf_node_to_samples(leaf_node): for i in range(len(leaf_node)): samples.append({}) samples[-1]["task_description"] = 
leaf_node[i]["task_description"]
+        sample_type = "freeform_skill"
         if leaf_node[i].get("input"):
+            sample_type = "grounded_skill"
             samples[-1]["seed_context"] = leaf_node[i]["input"]
         samples[-1]["seed_question"] = leaf_node[i]["instruction"]
         samples[-1]["seed_response"] = leaf_node[i]["output"]
+        samples[-1]["leaf_node_type"] = sample_type
 
-    return Dataset.from_list(samples)
+    return samples
+
+
+def _enrich_metadata(samples, leaf_node):
+    leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
+    for i, sample in enumerate(samples):
+        sample["leaf_node_path"] = leaf_node_path
+        samples[i] = sample
+    return samples
 
 
 def leaf_node_to_samples(
@@ -480,15 +492,17 @@
     model_name,
     docling_model_path=None,
 ):
-    if not leaf_node:
-        return []
-    if leaf_node[0].get("documents"):
-        return _knowledge_leaf_node_to_samples(
-            leaf_node,  # pylint: disable=duplicate-code
+    samples = []
+    if leaf_node and leaf_node[0].get("documents"):
+        samples = _knowledge_leaf_node_to_samples(
+            leaf_node,
             server_ctx_size,
             chunk_word_count,
             document_output_dir,
             model_name,
             docling_model_path,
         )
-    return _skill_leaf_node_to_samples(leaf_node)
+    elif leaf_node:
+        samples = _skill_leaf_node_to_samples(leaf_node)
+    samples = _enrich_metadata(samples, leaf_node)
+    return Dataset.from_list(samples)
diff --git a/tests/conftest.py b/tests/conftest.py
index ed3fd8c4..be3f249a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,7 +30,7 @@ def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
 
 def get_ctx(**kwargs) -> PipelineContext:
     kwargs.setdefault("client", mock.MagicMock())
-    kwargs.setdefault("model_family", "test")
+    kwargs.setdefault("model_family", "merlinite")
     kwargs.setdefault("model_id", "test-model")
     kwargs.setdefault("num_instructions_to_generate", 10)
     kwargs.setdefault("dataset_num_procs", 1)
diff --git a/tests/functional/__init__.py b/tests/functional/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/functional/test_granular_api.py b/tests/functional/test_granular_api.py
new file mode 100644
index 00000000..79207368
--- /dev/null
+++ b/tests/functional/test_granular_api.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Standard
+from datetime import datetime
+from unittest.mock import MagicMock
+import glob
+import pathlib
+
+# Third Party
+import git
+
+# First Party
+from instructlab.sdg import BlockRegistry
+from instructlab.sdg.generate_data import (
+    generate_taxonomy,
+    mix_datasets,
+    postprocess_taxonomy,
+    preprocess_taxonomy,
+)
+
+# Local
+from ..mockllmblock import MockLLMBlock
+
+
+def _clone_instructlab_taxonomy(taxonomy_dir):
+    taxonomy_repo_url = "https://github.com/instructlab/taxonomy"
+    taxonomy_commit = "dfa3afaf26f40f923cf758389719619ec9b1ddb1"
+    repo = git.Repo.clone_from(taxonomy_repo_url, taxonomy_dir, no_checkout=True)
+    repo.git.checkout(taxonomy_commit)
+
+
+def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.Path):
+    # Register our mock block so we can reference it in pipelines
+    BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
+
+    # Clone a taxonomy and edit 1 file in it
+    taxonomy_dir = tmp_path.joinpath("taxonomy")
+    _clone_instructlab_taxonomy(taxonomy_dir)
+    changed_qna_yaml = taxonomy_dir.joinpath(
+        "knowledge", "science", "animals", "birds", "black_capped_chickadee", "qna.yaml"
+    )
+    with open(changed_qna_yaml, "a", encoding="utf-8") as file:
+        file.write("")
+
+    pipeline_dir = testdata_path.joinpath("mock_pipelines")
+    date_suffix = 
datetime.now().replace(microsecond=0).isoformat().replace(":", "_") + + preprocessed_dir = tmp_path.joinpath("preprocessed") + preprocess_taxonomy( + taxonomy_dir=taxonomy_dir, + output_dir=preprocessed_dir, + ) + chickadee_docs = glob.glob( + str( + preprocessed_dir.joinpath( + "documents", "knowledge_science_*", "chickadee.md" + ) + ) + ) + assert chickadee_docs + chickadee_samples_path = preprocessed_dir.joinpath( + "knowledge_science_animals_birds_black_capped_chickadee.jsonl" + ) + assert chickadee_samples_path.is_file() + + client = MagicMock() + client.server_supports_batched = False + generated_dir = tmp_path.joinpath("generated") + generate_taxonomy( + client=client, + input_dir=preprocessed_dir, + output_dir=generated_dir, + pipeline=pipeline_dir, + ) + generated_chickadee_samples_path = generated_dir.joinpath( + "knowledge_science_animals_birds_black_capped_chickadee.jsonl" + ) + assert generated_chickadee_samples_path.is_file() + + postprocessed_dir = tmp_path.joinpath("postprocessed") + postprocess_taxonomy( + input_dir=generated_dir, + output_dir=postprocessed_dir, + date_suffix=date_suffix, + pipeline=pipeline_dir, + ) + knowledge_recipe_file = postprocessed_dir.joinpath( + f"knowledge_recipe_{date_suffix}.yaml" + ) + assert knowledge_recipe_file.is_file() + skills_recipe_file = postprocessed_dir.joinpath(f"skills_recipe_{date_suffix}.yaml") + assert skills_recipe_file.is_file() + + mixed_skills_output_file = ( + f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl" + ) + mix_datasets( + recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml", + output_file=mixed_skills_output_file, + ) + assert pathlib.Path(mixed_skills_output_file).is_file() diff --git a/tests/mockllmblock.py b/tests/mockllmblock.py new file mode 100644 index 00000000..744cd6d5 --- /dev/null +++ b/tests/mockllmblock.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import random +import string + +# Third Party +from datasets import Dataset + +# First Party +from instructlab.sdg import LLMBlock + + +def _random_string(size): + return "".join(random.choices(string.ascii_lowercase, k=size)) + + +def _add_mocked_cols(sample, block_name): + match block_name: + case "gen_questions" | "gen_grounded_questions": + sample["question"] = f"Is this a question {_random_string(8)}?" + case "eval_questions" | "eval_grounded_questions": + sample["evaluation"] = "This is an evaluation." + sample["score"] = "1" + case "gen_responses" | "gen_grounded_responses": + sample["response"] = "This is a response." + case "evaluate_qa_pair" | "evaluate_grounded_qa_pair": + sample["evaluation"] = "This is an evaluation." + sample["score"] = "2" + case "gen_contexts": + sample["context"] = f"This is a context {_random_string(8)}." + case "gen_spellcheck": + sample["spellcheck"] = sample["document"] + case "gen_knowledge": + sample["question"] = f"Is this a question {_random_string(8)}?" + sample["response"] = "This is a response." + case "eval_faithfulness_qa_pair": + sample["explanation"] = "This is an explanation." + sample["judgment"] = "YES" + case "eval_relevancy_qa_pair": + sample["feedback"] = "This is some feedback." + sample["score"] = "2" + case "eval_verify_question": + sample["explanation"] = "This is an explanation." + sample["rating"] = "1" + case _: + raise Exception( + f"Received an un-mocked LLMBlock: {block_name}. Add code in {__file__} to handle this block." 
+ ) + return sample + + +class MockLLMBlock(LLMBlock): + def generate(self, samples: Dataset): + return samples.map(_add_mocked_cols, fn_kwargs={"block_name": self.block_name}) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index c2968029..0116253e 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -21,7 +21,12 @@ # First Party from instructlab.sdg import LLMBlock, PipelineContext -from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data +from instructlab.sdg.generate_data import ( + _context_init, + _locate_docling_models, + _sdg_init, + generate_data, +) TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." @@ -85,7 +90,7 @@ def validate_messages_dataset(dataset_file_name, expected_samples): def validate_skill_leaf_node_dataset(dataset_file_name): ds = load_dataset("json", data_files=dataset_file_name, split="train") - assert len(ds.features) == 7 + assert len(ds.features) == 9 features = [ "task_description", "seed_context", @@ -93,6 +98,8 @@ def validate_skill_leaf_node_dataset(dataset_file_name): "seed_response", "output", "id", + "leaf_node_path", + "leaf_node_type", ] for feature in features: assert feature in ds.features @@ -518,7 +525,8 @@ def test_generate(self): ) mocked_logger.warning.assert_called() assert re.search( - "empty sdg output: knowledge_new", mocked_logger.warning.call_args.args[0] + "empty sdg output: .+knowledge_new.jsonl", + mocked_logger.warning.call_args.args[0], ) def teardown(self) -> None: @@ -567,35 +575,15 @@ def test_context_init_batch_size_optional(): assert ctx.batch_size == 20 -def test_sdg_init_docling_path_config_found(testdata_path): +def test_locate_docling_models_config_found(testdata_path): with patch.dict(os.environ): os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir")) - ctx = _context_init( - None, - "mixtral", - "foo.bar", - 1, - "/checkpoint/dir", - 1, - batch_size=20, - batch_num_workers=32, - ) - _, _, _, docling_model_path = _sdg_init(ctx, "full") + docling_model_path = _locate_docling_models() assert docling_model_path == "/mock/docling-models" -def test_sdg_init_docling_path_config_not_found(testdata_path): +def test_locate_docling_models_config_not_found(testdata_path): with patch.dict(os.environ): os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir")) - ctx = _context_init( - None, - "mixtral", - "foo.bar", - 1, - "/checkpoint/dir", - 1, - batch_size=20, - batch_num_workers=32, - ) - _, _, _, docling_model_path = _sdg_init(ctx, "full") + docling_model_path = _locate_docling_models() assert docling_model_path is None diff --git a/tests/test_models.py b/tests/test_models.py index e3755553..ef4dd412 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -64,3 +64,6 @@ def test_unknown_model_family(self): "foobar", "./models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf" ) assert "Unknown model family: foobar" in str(exc.value) + + def test_none_args(self): + assert models.get_model_family(None, None) == "merlinite" diff --git a/tests/testdata/mock_pipelines/freeform_skills.yaml b/tests/testdata/mock_pipelines/freeform_skills.yaml new file mode 100644 index 00000000..9f9043cb --- /dev/null +++ b/tests/testdata/mock_pipelines/freeform_skills.yaml @@ -0,0 +1,53 @@ +version: "1.0" +blocks: + - name: gen_questions + type: MockLLMBlock + config: + 
config_path: ../../../src/instructlab/sdg/configs/skills/freeform_questions.yaml + output_cols: + - question + batch_kwargs: + num_samples: 30 + drop_duplicates: + - question + - name: eval_questions + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_freeform_questions.yaml + output_cols: + - evaluation + - score + - name: filter_questions + type: FilterByValueBlock + config: + filter_column: score + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - evaluation + - score + - num_samples + - name: gen_responses + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/freeform_responses.yaml + output_cols: + - response + - name: evaluate_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_freeform_pair.yaml + output_cols: + - evaluation + - score + - name: filter_qa_pair + type: FilterByValueBlock + config: + filter_column: score + filter_value: 2.0 + operation: ge + convert_dtype: float + drop_columns: + - evaluation + - score diff --git a/tests/testdata/mock_pipelines/grounded_skills.yaml b/tests/testdata/mock_pipelines/grounded_skills.yaml new file mode 100644 index 00000000..4aceaf0f --- /dev/null +++ b/tests/testdata/mock_pipelines/grounded_skills.yaml @@ -0,0 +1,70 @@ +version: "1.0" +blocks: + - name: gen_contexts + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/contexts.yaml + output_cols: + - context + gen_kwargs: + temperature: 0.7 + max_tokens: 2048 + n: 10 + seed: 42 + drop_duplicates: + - context + - name: gen_grounded_questions + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/grounded_questions.yaml + output_cols: + - question + batch_kwargs: + num_samples: 3 + drop_duplicates: + - question + - name: eval_grounded_questions + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml + output_cols: + - evaluation + - score + - name: filter_grounded_questions + type: FilterByValueBlock + config: + filter_column: score + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - evaluation + - score + - num_samples + - name: gen_grounded_responses + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/grounded_responses.yaml + output_cols: + - response + - name: evaluate_grounded_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml + output_cols: + - evaluation + - score + - name: filter_grounded_qa_pair + type: FilterByValueBlock + config: + filter_column: score + filter_value: 2.0 + operation: ge + convert_dtype: float + - name: combine_question_and_context + type: CombineColumnsBlock + config: + columns: + - context + - question + output_col: question diff --git a/tests/testdata/mock_pipelines/knowledge.yaml b/tests/testdata/mock_pipelines/knowledge.yaml new file mode 100644 index 00000000..1eb2d066 --- /dev/null +++ b/tests/testdata/mock_pipelines/knowledge.yaml @@ -0,0 +1,113 @@ +version: "1.0" +blocks: + - name: duplicate_document_col + type: DuplicateColumnsBlock + config: + columns_map: + document: base_document + + - name: gen_spellcheck + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/spellcheck.yaml + output_cols: + - spellcheck + gen_kwargs: + max_tokens: 2048 + + - name: flatten_auxiliary_columns + 
type: FlattenColumnsBlock + config: + var_cols: + - spellcheck + - base_document + value_name: corrected_document + var_name: dataset_type + + - name: rename_to_document_column + type: RenameColumnsBlock + config: + columns_map: + document: raw_document + corrected_document: document + + - name: gen_knowledge + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml + output_cols: + - question + - response + parser_kwargs: + parser_name: custom + parsing_pattern: '\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)' + parser_cleanup_tags: + - "[END]" + - "[End]" + gen_kwargs: + max_tokens: 2048 + drop_duplicates: + - question + - name: eval_faithfulness_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/evaluate_faithfulness.yaml + output_cols: + - explanation + - judgment + gen_kwargs: + max_tokens: 2048 + - name: filter_faithfulness + type: FilterByValueBlock + config: + filter_column: judgment + filter_value: "YES" + operation: eq + drop_columns: + - judgment + - explanation + - name: eval_relevancy_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/evaluate_relevancy.yaml + output_cols: + - feedback + - score + gen_kwargs: + max_tokens: 2048 + - name: filter_relevancy + type: FilterByValueBlock + config: + filter_column: score + filter_value: 2.0 + operation: eq + convert_dtype: float + drop_columns: + - feedback + - score + - name: eval_verify_question + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/evaluate_question.yaml + output_cols: + - explanation + - rating + gen_kwargs: + max_tokens: 2048 + - name: filter_verify_question + type: FilterByValueBlock + config: + filter_column: rating + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - explanation + - rating + - __index_level_0__ + +datamixing: + auxiliary_instructions: + spellcheck: + - Correct any spelling errors in the document and output the corrected version. + - Rewrite the document to remove any spelling errors. From 6c8544e5abdeb73c6675264298886e906a610c23 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Wed, 15 Jan 2025 13:16:17 -0500 Subject: [PATCH 2/3] Parameterize # of seed examples when converting to test data Instead of hardcoding this to always be 3, add a parameter with a default of 3 when converting our seed examples to the test output dataset. 
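
For example, a caller that only wants the first two Q&A pairs from each
knowledge seed example in its test data could now do (a sketch against
the new signature; `seed_example` is any preprocessed knowledge sample
carrying the `icl_query_*`/`icl_response_*` keys):

    test_samples = _knowledge_seed_example_to_test_data(
        seed_example, system_prompt, num_iterations=2
    )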
Co-authored-by: Aakanksha Duggal
Signed-off-by: Ben Browning
---
 src/instructlab/sdg/generate_data.py  | 4 ++--
 tests/functional/test_granular_api.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index a14a190a..a94683a4 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -123,9 +123,9 @@ def _gen_train_data(
     jldump(messages_data, output_file_messages)
 
 
-def _knowledge_seed_example_to_test_data(seed_example, system_prompt):
+def _knowledge_seed_example_to_test_data(seed_example, system_prompt, num_iterations=3):
     res = []
-    for i in range(3):
+    for i in range(num_iterations):
         idx = i + 1
         user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"]
         test_sample = {
diff --git a/tests/functional/test_granular_api.py b/tests/functional/test_granular_api.py
index 79207368..33de8a25 100644
--- a/tests/functional/test_granular_api.py
+++ b/tests/functional/test_granular_api.py
@@ -46,9 +46,11 @@ def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.
     date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
 
     preprocessed_dir = tmp_path.joinpath("preprocessed")
+    teacher_model_path = testdata_path.joinpath("models/instructlab/granite-7b-lab")
     preprocess_taxonomy(
         taxonomy_dir=taxonomy_dir,
         output_dir=preprocessed_dir,
+        teacher_model_path=teacher_model_path,
     )
     chickadee_docs = glob.glob(
         str(
 
From 1f9394d6dfe890310c29befc4c4ad0461635351c Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 15 Jan 2025 14:50:26 -0500
Subject: [PATCH 3/3] Add examples for how to use data mixing

This adds a new docs/examples/mix_datasets folder with a couple of
example recipes, two sample datasets, and an example_mixing.py Python
script to show how to mix datasets.

This also adds a test_examples.py file that actually runs our examples,
ensuring they work without error and generate the expected mixed
datasets.
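
Stripped of its path handling, the example script boils down to the
following sketch (recipe and output paths are the ones shipped in this
change):

    from instructlab.sdg import mix_datasets

    # Take all samples from both datasets (sampling_size: 1.0 each)
    mix_datasets("concatenate_recipe.yaml", "output/concatenated.jsonl")

    # Double dataset 1 (sampling_size: 2.0), take 20% of dataset 2 (0.2)
    mix_datasets("weighted_recipe.yaml", "output/weighted.jsonl")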
Signed-off-by: Ben Browning --- .gitignore | 3 + .../mix_datasets/concatenate_recipe.yaml | 8 + docs/examples/mix_datasets/dataset_1.jsonl | 5 + docs/examples/mix_datasets/dataset_2.jsonl | 5 + docs/examples/mix_datasets/example_mixing.py | 18 ++ .../mix_datasets/weighted_recipe.yaml | 9 + tests/functional/conftest.py | 7 + tests/functional/test_examples.py | 40 +++++ tests/functional/test_granular_api.py | 170 ++++++++++-------- tests/taxonomy.py | 7 +- tests/test_generate_data.py | 8 +- 11 files changed, 203 insertions(+), 77 deletions(-) create mode 100644 docs/examples/mix_datasets/concatenate_recipe.yaml create mode 100644 docs/examples/mix_datasets/dataset_1.jsonl create mode 100644 docs/examples/mix_datasets/dataset_2.jsonl create mode 100644 docs/examples/mix_datasets/example_mixing.py create mode 100644 docs/examples/mix_datasets/weighted_recipe.yaml create mode 100644 tests/functional/test_examples.py diff --git a/.gitignore b/.gitignore index b2911a08..70a55104 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,6 @@ cython_debug/ # IDEs .vscode/ + +# SDG examples output +docs/examples/**/output \ No newline at end of file diff --git a/docs/examples/mix_datasets/concatenate_recipe.yaml b/docs/examples/mix_datasets/concatenate_recipe.yaml new file mode 100644 index 00000000..c3d95923 --- /dev/null +++ b/docs/examples/mix_datasets/concatenate_recipe.yaml @@ -0,0 +1,8 @@ +# An example of how to concatenate two datasets +# Each dataset has a sampling_size of 1.0 to take all samples from both +datasets: +- path: dataset_1.jsonl + sampling_size: 1.0 +- path: dataset_2.jsonl + sampling_size: 1.0 +sys_prompt: I am a reliable AI assistant. diff --git a/docs/examples/mix_datasets/dataset_1.jsonl b/docs/examples/mix_datasets/dataset_1.jsonl new file mode 100644 index 00000000..89059add --- /dev/null +++ b/docs/examples/mix_datasets/dataset_1.jsonl @@ -0,0 +1,5 @@ +{"id": "dataset_1_1", "messages": [], "metadata": {}} +{"id": "dataset_1_2", "messages": [], "metadata": {}} +{"id": "dataset_1_3", "messages": [], "metadata": {}} +{"id": "dataset_1_4", "messages": [], "metadata": {}} +{"id": "dataset_1_5", "messages": [], "metadata": {}} diff --git a/docs/examples/mix_datasets/dataset_2.jsonl b/docs/examples/mix_datasets/dataset_2.jsonl new file mode 100644 index 00000000..35e063e9 --- /dev/null +++ b/docs/examples/mix_datasets/dataset_2.jsonl @@ -0,0 +1,5 @@ +{"id": "dataset_2_1", "messages": [], "metadata": {}} +{"id": "dataset_2_2", "messages": [], "metadata": {}} +{"id": "dataset_2_3", "messages": [], "metadata": {}} +{"id": "dataset_2_4", "messages": [], "metadata": {}} +{"id": "dataset_2_5", "messages": [], "metadata": {}} diff --git a/docs/examples/mix_datasets/example_mixing.py b/docs/examples/mix_datasets/example_mixing.py new file mode 100644 index 00000000..5382d9cb --- /dev/null +++ b/docs/examples/mix_datasets/example_mixing.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +from pathlib import Path + +# First Party +from instructlab.sdg import mix_datasets + +output_dir = Path(__file__).parent.joinpath("output") +output_dir.mkdir(exist_ok=True) + +concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml") +concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl") +mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl) + +weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml") +weighted_output_jsonl = output_dir.joinpath("weighted.jsonl") +mix_datasets(weighted_recipe_yaml, 
weighted_output_jsonl)
diff --git a/docs/examples/mix_datasets/weighted_recipe.yaml b/docs/examples/mix_datasets/weighted_recipe.yaml
new file mode 100644
index 00000000..39d6f17b
--- /dev/null
+++ b/docs/examples/mix_datasets/weighted_recipe.yaml
@@ -0,0 +1,9 @@
+# An example of how to weight one dataset over another
+# Dataset 1 has a sampling size of 2.0 to double its samples
+# Dataset 2 has a sampling size of 0.2 to take 20% of its samples
+datasets:
+- path: dataset_1.jsonl
+  sampling_size: 2.0
+- path: dataset_2.jsonl
+  sampling_size: 0.2
+sys_prompt: I am a reliable AI assistant.
diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index c1793a94..9c5ec111 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -6,9 +6,16 @@
 import pytest
 
 TESTS_PATH = pathlib.Path(__file__).parent.parent.absolute()
+EXAMPLES_PATH = TESTS_PATH.parent.joinpath("docs", "examples")
 
 
 @pytest.fixture
 def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
     """Path to local test data directory"""
     yield TESTS_PATH / "testdata"
+
+
+@pytest.fixture
+def examples_path() -> typing.Generator[pathlib.Path, None, None]:
+    """Path to examples directory"""
+    yield EXAMPLES_PATH
diff --git a/tests/functional/test_examples.py b/tests/functional/test_examples.py
new file mode 100644
index 00000000..c1fe8bd0
--- /dev/null
+++ b/tests/functional/test_examples.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Standard
+import pathlib
+import shutil
+import subprocess
+import sys
+
+# First Party
+from instructlab.sdg.utils.json import jlload
+
+
+def test_example_mixing(tmp_path: pathlib.Path, examples_path: pathlib.Path):
+    example_copy_path = tmp_path.joinpath("mix_datasets")
+    shutil.copytree(examples_path.joinpath("mix_datasets"), example_copy_path)
+    script = example_copy_path.joinpath("example_mixing.py")
+    subprocess.check_call([sys.executable, str(script)], text=True)
+
+    concatenated = jlload(example_copy_path.joinpath("output", "concatenated.jsonl"))
+    assert len(concatenated) == 10
+    from_ds_1 = []
+    from_ds_2 = []
+    for sample in concatenated:
+        if sample["id"].startswith("dataset_1"):
+            from_ds_1.append(sample)
+        else:
+            from_ds_2.append(sample)
+    assert len(from_ds_1) == len(from_ds_2) == 5
+
+    weighted = jlload(example_copy_path.joinpath("output", "weighted.jsonl"))
+    assert len(weighted) == 11
+    from_ds_1 = []
+    from_ds_2 = []
+    for sample in weighted:
+        if sample["id"].startswith("dataset_1"):
+            from_ds_1.append(sample)
+        else:
+            from_ds_2.append(sample)
+    assert len(from_ds_1) == 10
+    assert len(from_ds_2) == 1
diff --git a/tests/functional/test_granular_api.py b/tests/functional/test_granular_api.py
index 33de8a25..6b83bbc6 100644
--- a/tests/functional/test_granular_api.py
+++ b/tests/functional/test_granular_api.py
@@ -4,10 +4,13 @@
 from datetime import datetime
 from unittest.mock import MagicMock
 import glob
+import os
 import pathlib
+import unittest
 
 # Third Party
 import git
+import pytest
 
 # First Party
 from instructlab.sdg import BlockRegistry
@@ -20,6 +23,7 @@
 
 # Local
 from ..mockllmblock import MockLLMBlock
+from ..taxonomy import load_test_skills
 
 
 def _clone_instructlab_taxonomy(taxonomy_dir):
@@ -29,75 +33,99 @@ def _clone_instructlab_taxonomy(taxonomy_dir):
     repo.git.checkout(taxonomy_commit)
 
 
-def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.Path):
+class TestGranularAPI(unittest.TestCase):
+    @pytest.fixture(autouse=True)
+    def _init_taxonomy(self, taxonomy_dir, testdata_path, tmp_path):
+        self.test_taxonomy = taxonomy_dir
+        self.testdata_path = testdata_path
+        self.tmp_path = tmp_path
+
+    def setUp(self):
+        test_valid_knowledge_skill_file = self.testdata_path.joinpath(
+            "test_valid_knowledge_skill.yaml"
+        )
+        untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
+        test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
+        self.test_taxonomy.create_untracked(
+            untracked_knowledge_file, test_valid_knowledge_skill
+        )
+
+    def file_list(self):
+        return glob.glob(str(self.tmp_path.joinpath("**/*")), recursive=True)
+
+    def test_granular_api_end_to_end(self):
+        # Register our mock block so we can reference it in pipelines
+        BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
+
+        # Use the test taxonomy from our fixtures, seeded in setUp with 1 new qna.yaml
+        taxonomy_dir = self.tmp_path
+
+        pipeline_dir = self.testdata_path.joinpath("mock_pipelines")
+        date_suffix = (
+            datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+        )
+
+        preprocessed_dir = self.tmp_path.joinpath("preprocessed")
+        teacher_model_path = self.testdata_path.joinpath(
+            "models/instructlab/granite-7b-lab"
+        )
+        preprocess_taxonomy(
+            taxonomy_dir=taxonomy_dir,
+            output_dir=preprocessed_dir,
+            teacher_model_path=teacher_model_path,
+        )
+        docs = glob.glob(
+            str(preprocessed_dir.joinpath("documents", "knowledge_new_*", "phoenix.md"))
+        )
+        assert docs, f"Expected docs not found in {self.file_list()}"
+        samples_path = preprocessed_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            samples_path.is_file()
+        ), f"Expected samples file not found in {self.file_list()}"
+
+        client = MagicMock()
+        client.server_supports_batched = False
+        generated_dir = self.tmp_path.joinpath("generated")
+        generate_taxonomy(
+            client=client,
+            input_dir=preprocessed_dir,
+            output_dir=generated_dir,
+            pipeline=pipeline_dir,
+            num_cpus=1,  # Test is faster running on a single CPU vs forking
+            batch_size=0,  # Disable batch for tiny dataset and fastest test
+        )
+        generated_samples_path = generated_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            generated_samples_path.is_file()
+        ), f"Generated samples not found in {self.file_list()}"
+
+        postprocessed_dir = self.tmp_path.joinpath("postprocessed")
+        postprocess_taxonomy(
+            input_dir=generated_dir,
+            output_dir=postprocessed_dir,
+            date_suffix=date_suffix,
+            pipeline=pipeline_dir,
+        )
+        knowledge_recipe_file = postprocessed_dir.joinpath(
+            f"knowledge_recipe_{date_suffix}.yaml"
+        )
+        assert (
+            knowledge_recipe_file.is_file()
+        ), f"Generated knowledge recipe file not found in {self.file_list()}"
+        
skills_recipe_file = postprocessed_dir.joinpath( + f"skills_recipe_{date_suffix}.yaml" + ) + assert ( + skills_recipe_file.is_file() + ), f"Generated skills recipe file not found in {self.file_list()}" + + mixed_skills_output_file = ( + f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl" + ) + mix_datasets( + recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml", + output_file=mixed_skills_output_file, ) - ) - assert chickadee_docs - chickadee_samples_path = preprocessed_dir.joinpath( - "knowledge_science_animals_birds_black_capped_chickadee.jsonl" - ) - assert chickadee_samples_path.is_file() - - client = MagicMock() - client.server_supports_batched = False - generated_dir = tmp_path.joinpath("generated") - generate_taxonomy( - client=client, - input_dir=preprocessed_dir, - output_dir=generated_dir, - pipeline=pipeline_dir, - ) - generated_chickadee_samples_path = generated_dir.joinpath( - "knowledge_science_animals_birds_black_capped_chickadee.jsonl" - ) - assert generated_chickadee_samples_path.is_file() - - postprocessed_dir = tmp_path.joinpath("postprocessed") - postprocess_taxonomy( - input_dir=generated_dir, - output_dir=postprocessed_dir, - date_suffix=date_suffix, - pipeline=pipeline_dir, - ) - knowledge_recipe_file = postprocessed_dir.joinpath( - f"knowledge_recipe_{date_suffix}.yaml" - ) - assert knowledge_recipe_file.is_file() - skills_recipe_file = postprocessed_dir.joinpath(f"skills_recipe_{date_suffix}.yaml") - assert skills_recipe_file.is_file() - - mixed_skills_output_file = ( - f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl" - ) - mix_datasets( - recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml", - output_file=mixed_skills_output_file, - ) - assert pathlib.Path(mixed_skills_output_file).is_file() + assert pathlib.Path( + mixed_skills_output_file + ).is_file(), f"Generated mixed output not found in {self.file_list()}" diff --git a/tests/taxonomy.py b/tests/taxonomy.py index 227c2534..01904ad3 100644 --- a/tests/taxonomy.py +++ b/tests/taxonomy.py @@ -2,7 +2,7 @@ # Standard from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import shutil # Third Party @@ -68,3 +68,8 @@ def __enter__(self): def __exit__(self, *args): self.teardown() + + +def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: + with open(skills_file_path, "r", encoding="utf-8") as skills_file: + return yaml.safe_load(skills_file) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index 0116253e..0e718383 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -28,6 +28,9 @@ generate_data, ) +# Local +from .taxonomy import load_test_skills + TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." TEST_TAXONOMY_BASE = "main" @@ -232,11 +235,6 @@ def add_question_mark(q): return train_samples -def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: - with open(skills_file_path, "r", encoding="utf-8") as skills_file: - return yaml.safe_load(skills_file) - - def _noop_llmblock_generate(self, samples): """Generate mock output based on input samples.