From 0f784655c38c4e1aa4426fc8b0b98b8c27fb01c3 Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Thu, 5 Dec 2024 12:39:08 -0500
Subject: [PATCH 1/3] Split `generate_data` into multiple discrete steps

This doesn't move things out into separate files yet, but it does split
the existing functionality of `generate_data` into multiple discrete
steps and changes `generate_data` to just call those steps.

This is a step towards cleaner separation between the steps and towards
top-level Python APIs for each discrete step, for advanced use cases
that need more than a single end-to-end generation pipeline.

Signed-off-by: Ben Browning
---
 src/instructlab/sdg/__init__.py               |   3 +-
 src/instructlab/sdg/datamixing.py             |  23 +-
 src/instructlab/sdg/generate_data.py          | 595 +++++++++++++-----
 src/instructlab/sdg/pipeline.py               |   5 +-
 src/instructlab/sdg/utils/json.py             |   6 +
 src/instructlab/sdg/utils/models.py           |   9 +-
 src/instructlab/sdg/utils/taxonomy.py         |  30 +-
 tests/conftest.py                             |   2 +-
 tests/functional/__init__.py                  |   0
 tests/functional/test_granular_api.py         | 101 +++
 tests/mockllmblock.py                         |  55 ++
 tests/test_generate_data.py                   |  42 +-
 tests/test_models.py                          |   3 +
 .../mock_pipelines/freeform_skills.yaml       |  53 ++
 .../mock_pipelines/grounded_skills.yaml       |  70 +++
 tests/testdata/mock_pipelines/knowledge.yaml  | 113 ++++
 16 files changed, 902 insertions(+), 208 deletions(-)
 create mode 100644 tests/functional/__init__.py
 create mode 100644 tests/functional/test_granular_api.py
 create mode 100644 tests/mockllmblock.py
 create mode 100644 tests/testdata/mock_pipelines/freeform_skills.yaml
 create mode 100644 tests/testdata/mock_pipelines/grounded_skills.yaml
 create mode 100644 tests/testdata/mock_pipelines/knowledge.yaml

diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py
index 490df8e4..5cc9d95f 100644
--- a/src/instructlab/sdg/__init__.py
+++ b/src/instructlab/sdg/__init__.py
@@ -29,6 +29,7 @@
     "FULL_PIPELINES_PACKAGE",
     "SIMPLE_PIPELINES_PACKAGE",
     "generate_data",
+    "mix_datasets",
 )
 
 # Local
@@ -50,7 +51,7 @@
     SelectorBlock,
     SetToMajorityValueBlock,
 )
-from .generate_data import generate_data
+from .generate_data import generate_data, mix_datasets
 from .pipeline import (
     FULL_PIPELINES_PACKAGE,
     SIMPLE_PIPELINES_PACKAGE,
diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
index e6ca8675..de31e136 100644
--- a/src/instructlab/sdg/datamixing.py
+++ b/src/instructlab/sdg/datamixing.py
@@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
         Create the final mixed dataset by loading, sampling, and
         concatenating all datasets in this recipe
         """
-        if not self.dataset_added:
+        if not self.datasets:
             logger.error("No dataset added to the recipe")
 
         mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
             sampling_size=self.NUM_SYNTH_SKILLS,
         )
 
+    def _write_mixed_recipe(self, recipe, output_file_recipe):
+        """
+        Write the recipes created during data mixing without writing the actual
+        mixed datasets to disk.
+        """
+        full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
+        recipe.save_recipe(full_recipe_path)
+
     def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
         """
         Mix the generated leaf node data into a single dataset and write it to
         disk. The heavy lifting is delegated to the Recipe class.
""" + self._write_mixed_recipe(recipe, output_file_recipe) if recipe.dataset_added: - full_recipe_path = os.path.join(self.output_dir, output_file_recipe) - recipe.save_recipe(full_recipe_path) recipe.save_mixed_dataset( os.path.join(self.output_dir, output_file_data), self.num_procs, ) + def write_recipes(self): + self._write_mixed_recipe( + self.knowledge_recipe, + self.output_file_knowledge_recipe, + ) + self._write_mixed_recipe( + self.skills_recipe, + self.output_file_skills_recipe, + ) + def generate(self): self._gen_mixed_data( self.knowledge_recipe, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index badf3834..a14a190a 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -5,7 +5,7 @@ from importlib import resources from pathlib import Path from typing import Optional -import dataclasses +import glob import json import logging import os @@ -13,13 +13,19 @@ # Third Party # instructlab - All of these need to go away (other than sdg) - issue #6 +from datasets import Dataset from xdg_base_dirs import xdg_data_dirs, xdg_data_home import openai import yaml # First Party from instructlab.sdg.blocks.llmblock import DEFAULT_MAX_NUM_TOKENS -from instructlab.sdg.datamixing import DataMixer, _get_question_hack, _get_response_hack +from instructlab.sdg.datamixing import ( + DataMixer, + Recipe, + _get_question_hack, + _get_response_hack, +) from instructlab.sdg.eval_data import generate_eval_task_data, mmlubench_pipe_init from instructlab.sdg.pipeline import ( FULL_PIPELINES_PACKAGE, @@ -27,8 +33,8 @@ Pipeline, PipelineContext, ) -from instructlab.sdg.utils import GenerateException, models -from instructlab.sdg.utils.json import jldump +from instructlab.sdg.utils import GenerateException +from instructlab.sdg.utils.json import jldump, jlload from instructlab.sdg.utils.taxonomy import ( leaf_node_to_samples, read_taxonomy_leaf_nodes, @@ -38,6 +44,10 @@ _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." +DEFAULT_CHUNK_WORD_COUNT = 1000 +DEFAULT_TAXONOMY_BASE = "empty" +DEFAULT_SERVER_CTX_SIZE = 4096 + def _unescape(s): return bytes(s, "utf-8").decode("utf-8").strip() @@ -115,20 +125,21 @@ def _gen_train_data( def _knowledge_seed_example_to_test_data(seed_example, system_prompt): res = [] - for qna in seed_example["questions_and_answers"]: - user = qna["question"] + "\n" + seed_example["context"] - res.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(qna["answer"]), - } - ) + for i in range(3): + idx = i + 1 + user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"] + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example[f"icl_response_{idx}"]), + } + if system_prompt: + test_sample["system"] = system_prompt + res.append(test_sample) return res def _gen_test_data( - leaf_nodes, + seed_examples, output_file_test, system_prompt, ): @@ -137,30 +148,29 @@ def _gen_test_data( in instructlab/instructlab. 
""" test_data = [] - for _, leaf_node in leaf_nodes.items(): - for seed_example in leaf_node: - if "questions_and_answers" in seed_example: - test_data.extend( - _knowledge_seed_example_to_test_data(seed_example, system_prompt) - ) - continue + for seed_example in seed_examples: + if "icl_query_1" in seed_example: + test_data.extend( + _knowledge_seed_example_to_test_data(seed_example, system_prompt) + ) + continue - # skill seed example + # skill seed example - user = seed_example["instruction"] # question + user = seed_example["seed_question"] # question - if len(seed_example["input"]) > 0: - user += "\n" + seed_example["input"] # context + if seed_example["leaf_node_type"] == "grounded_skill": + user += "\n" + seed_example["seed_context"] # context - test_data.append( - { - "system": system_prompt, - "user": _unescape(user), - "assistant": _unescape(seed_example["output"]), # answer - } - ) + test_sample = { + "user": _unescape(user), + "assistant": _unescape(seed_example["seed_response"]), # answer + } + if system_prompt: + test_sample["system"] = system_prompt + test_data.append(test_sample) - jldump(test_data, output_file_test) + jldump(test_data, output_file_test) def _check_pipeline_dir(pipeline): @@ -171,6 +181,31 @@ def _check_pipeline_dir(pipeline): ) +def _locate_docling_models(): + # Search for the models in User and Site data directories + data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] + data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) + + docling_model_path = None + sdg_models_path = docling_model_path + for d in data_dirs: + if os.path.exists(os.path.join(d, "models")): + sdg_models_path = os.path.join(d, "models") + break + + if sdg_models_path is not None: + try: + with open( + os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" + ) as file: + config = yaml.safe_load(file) + docling_model_path = config["models"][0]["path"] + except (FileNotFoundError, NotADirectoryError, PermissionError) as e: + logger.warning(f"unable to read docling models path from config.yaml {e}") + + return docling_model_path + + def _context_init( client: openai.OpenAI, model_family: str, @@ -208,23 +243,6 @@ def _sdg_init(ctx, pipeline): data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - docling_model_path = None - sdg_models_path = docling_model_path - for d in data_dirs: - if os.path.exists(os.path.join(d, "models")): - sdg_models_path = os.path.join(d, "models") - break - - if sdg_models_path is not None: - try: - with open( - os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" - ) as file: - config = yaml.safe_load(file) - docling_model_path = config["models"][0]["path"] - except (FileNotFoundError, NotADirectoryError, PermissionError) as e: - logger.warning(f"unable to read docling models path from config.yaml {e}") - for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): @@ -256,12 +274,11 @@ def load_pipeline(yaml_basename): load_pipeline("knowledge.yaml"), load_pipeline("freeform_skills.yaml"), load_pipeline("grounded_skills.yaml"), - docling_model_path, ) def _mixer_init( - ctx, + num_procs, output_dir, date_suffix, knowledge_auxiliary_inst, @@ -275,91 +292,135 @@ def _mixer_init( output_dir, date_suffix, system_prompt, - ctx.dataset_num_procs, + num_procs, knowledge_auxiliary_inst, ) -# This is part of the public API, and used by instructlab. 
-# TODO - parameter removal needs to be done in sync with a CLI change.
-# to be removed: logger
-def generate_data(
-    client: openai.OpenAI,
-    logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
-    system_prompt: Optional[str] = None,
-    use_legacy_pretraining_format: Optional[bool] = True,
-    model_family: Optional[str] = None,
-    model_name: Optional[str] = None,
-    num_cpus: Optional[int] = None,
-    num_instructions_to_generate: Optional[int] = 30,
-    taxonomy: Optional[str] = None,  # TODO rename to taxonomy_path to match config
-    taxonomy_base: Optional[str] = None,
-    output_dir: Optional[str] = None,
-    console_output=True,
-    yaml_rules: Optional[str] = None,
-    chunk_word_count=None,
-    server_ctx_size=None,
-    pipeline: Optional[str] = "simple",
-    batch_size: Optional[int] = None,
-    checkpoint_dir: Optional[str] = None,
-    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
-) -> None:
-    """Generate data for training and testing a model.
+def _extract_leaf_node_path_and_type(sample):
+    leaf_node_path = sample.get("leaf_node_path", "unknown")
+    leaf_node_type = sample.get("leaf_node_type")
+    return leaf_node_path, leaf_node_type
 
-    This currently serves as the primary interface from the `ilab` CLI to the `sdg` library.
-    It is somewhat a transitionary measure, as this function existed back when all of the
-    functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to
-    use the SDG library constructs directly, and this function will likely be removed.
 
-    Args:
-        pipeline: This argument may be either an alias defined in a user or site "data directory"
-                  or an alias defined by the sdg library ("simple", "full")(if the data directory has no matches),
-                  or an absolute path to a directory containing the pipeline YAML files.
-                  We expect three files to be present in this directory: "knowledge.yaml",
-                  "freeform_skills.yaml", and "grounded_skills.yaml".
+def preprocess_taxonomy(
+    taxonomy_dir,
+    output_dir,
+    chunk_word_count=DEFAULT_CHUNK_WORD_COUNT,
+    server_ctx_size=DEFAULT_SERVER_CTX_SIZE,
+    taxonomy_base=DEFAULT_TAXONOMY_BASE,
+    teacher_model_path: Optional[str] = None,
+    yaml_rules: Optional[str] = None,
+    test_output_file: Optional[str] = None,
+    system_prompt: Optional[str] = None,
+):
     """
-    generate_start = time.time()
+    Preprocess a taxonomy into input samples suitable for use with
+    data generation pipelines. This does the following steps:
+
+    - Determine changed leaf nodes in the taxonomy
+    - Retrieve knowledge documents for changed taxonomy leaf nodes
+    - Convert any non-markdown knowledge documents to markdown
+    - Write the Docling json and markdown outputs from this conversion to
+      disk for other processes to consume if needed.
+    - Chunk the converted knowledge documents to the desired chunk sizes.
+    - Turn the qna.yaml and knowledge documents into samples in the format
+      expected by the `simple` and `full` data generation pipelines shipped
+      in SDG.
+    - Write these samples to disk, with one file per taxonomy leaf node.
 
-    system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT
+    Args:
+        taxonomy_dir: The path to the taxonomy
+        output_dir: Where to write the samples created for use with data generation
+        chunk_word_count: The target number of words per document chunk
+        server_ctx_size: The maximum number of tokens the inference server used
+                         during data generation can handle
+        taxonomy_base: Determines how we calculate what has changed. This should
+                       be a git reference or the special value of 'empty' which
+                       means assume the entire taxonomy has changed.
+        teacher_model_path: Path to the teacher model on disk, which we'll use to
+                            load its tokenizer for use with document chunking.
+        yaml_rules: Path to a custom YAML rules file for YAML linting.
+        test_output_file: Path to write a file with generated test samples
+        system_prompt: System prompt to use when generating test samples
+
+    Returns:
+        List[str]: The list of output sample files written to disk.
 
-    # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp
-    if batch_size is None:
-        batch_size = 0
+    """
+    logger.info("Converting taxonomy to samples")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+    output_files = []
 
-    if not os.path.exists(output_dir):
-        os.mkdir(output_dir)
+    if not (taxonomy_dir and os.path.exists(taxonomy_dir)):
+        raise GenerateException(f"Error: taxonomy ({taxonomy_dir}) does not exist.")
 
-    if not (taxonomy and os.path.exists(taxonomy)):
-        raise GenerateException(f"Error: taxonomy ({taxonomy}) does not exist.")
-
-    date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
-    document_output_dir = Path(output_dir) / f"documents-{date_suffix}"
+    document_output_dir = output_dir.joinpath("documents")
+    docling_model_path = _locate_docling_models()
 
     leaf_nodes = read_taxonomy_leaf_nodes(
-        taxonomy, taxonomy_base, yaml_rules, document_output_dir
+        taxonomy_dir, taxonomy_base, yaml_rules, document_output_dir
     )
     if not leaf_nodes:
         raise GenerateException("Error: No new leaf nodes found in the taxonomy.")
 
-    name = Path(model_name).stem  # Just in case it is a file path
-    output_file_messages = f"messages_{name}_{date_suffix}.jsonl"
-    output_file_test = f"test_{name}_{date_suffix}.jsonl"
-    output_file_train = f"train_{name}_{date_suffix}.jsonl"
+    # TODO: This is all a temporary hack here, as we either need to
+    # remove, deprecate, or otherwise determine the right way to
+    # support test samples
+    all_samples = []
+    for leaf_node in leaf_nodes.values():
+        leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
+        samples = leaf_node_to_samples(
+            leaf_node,
+            server_ctx_size,
+            chunk_word_count,
+            document_output_dir,
+            teacher_model_path,
+            docling_model_path=docling_model_path,
+        )
 
-    _gen_test_data(
-        leaf_nodes,
-        os.path.join(output_dir, output_file_test),
-        system_prompt,
-    )
+        if not samples:
+            raise GenerateException("Error: No samples found in leaf node.")
 
-    logger.debug(f"Generating to: {os.path.join(output_dir, output_file_test)}")
+        logger.debug("Samples: %s", samples)
+
+        output_file = output_dir.joinpath(f"{leaf_node_path}.jsonl")
+        all_samples.extend(samples)
+        jldump(samples, output_file)
+        output_files.append(str(output_file))
+
+    if test_output_file:
+        _gen_test_data(
+            all_samples,
+            test_output_file,
+            system_prompt,
+        )
+        logger.debug(f"Generating test data to: {test_output_file}")
 
-    model_family = models.get_model_family(model_family, model_name)
+    logger.info("Taxonomy converted to samples and written to %s", output_dir)
+    return output_files
 
+
+def generate_taxonomy(
+    client: openai.OpenAI,
+    input_dir: str,
+    output_dir: str,
+    logger: logging.Logger = logger,  # pylint: disable=redefined-outer-name
+    model_family: Optional[str] = None,
+    model_id: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    console_output=True,
+    pipeline: Optional[str] = "simple",
+    batch_size: Optional[int] = None,
+    checkpoint_dir: Optional[str] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+):
     ctx = _context_init(
         client,
         model_family,
-        model_name,
+        model_id,
         num_instructions_to_generate,
         checkpoint_dir,
         1,  # save_freq
@@ -368,20 +429,8 @@ def generate_data(
         max_num_tokens=max_num_tokens,
     )
 
-    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = (
-        _sdg_init(ctx, pipeline)
-    )
-
-    # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
-    mmlu_ctx = dataclasses.replace(ctx, checkpoint_dir=None)
-    mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx)
-
-    mixer = _mixer_init(
-        ctx,
-        output_dir,
-        date_suffix,
-        knowledge_pipe.auxiliary_inst,
-        system_prompt,
+    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
+        ctx, pipeline
     )
 
     if console_output:
         logger.info(
             "Synthesizing new instructions. If you aren't satisfied with the generated instructions, interrupt training (Ctrl-C) and try adjusting your YAML files. Adding more examples may help."
         )
 
-    generated_data = []
-    empty_sdg_leaf_nodes = []
-    for leaf_node in leaf_nodes.values():
-        is_knowledge = False
-        leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
-        samples = leaf_node_to_samples(
-            leaf_node,  # pylint: disable=duplicate-code
-            server_ctx_size,
-            chunk_word_count,
-            document_output_dir,
-            model_name,
-            docling_model_path=docling_model_path,
-        )
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+    empty_input_files = []
+    for input_file in input_files:
+        logger.debug("Generating data from input file: %s", input_file)
+        samples = jlload(input_file)
         if not samples:
-            raise GenerateException("Error: No samples found in leaf node.")
-
-        if "document" in samples.column_names:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        if leaf_node_type == "knowledge":
             pipe = knowledge_pipe
-            is_knowledge = True
-
-        elif "seed_context" in samples.column_names:
+        elif leaf_node_type == "grounded_skill":
             pipe = grounded_skills_pipe
-
         else:
             pipe = freeform_skills_pipe
 
-        logger.debug("Samples: %s", samples)
+        samples_ds = Dataset.from_list(samples)
+        logger.debug("Generating from samples: %s", samples_ds)
 
-        new_generated_data = pipe.generate(samples, leaf_node_path)
+        new_generated_data = pipe.generate(samples_ds, leaf_node_path)
         if len(new_generated_data) == 0:
-            empty_sdg_leaf_nodes.append(leaf_node_path)
-            logger.warning("Empty dataset for qna node: %s", leaf_node_path)
+            empty_input_files.append(input_file)
+            logger.warning("Empty generated dataset for input file: %s", input_file)
             continue
-        generated_data.append(new_generated_data)
 
-    logger.info("Generated %d samples", len(generated_data))
-    logger.debug("Generated data: %s", generated_data)
+        output_file = os.path.join(output_dir, os.path.basename(input_file))
+        jldump(new_generated_data, output_file)
+        logger.info("Generated %d samples", len(new_generated_data))
+        logger.debug("Generated data: %s", new_generated_data)
+
+    if len(empty_input_files) > 0:
+        logger.warning(
+            "Input sample files with empty sdg output: {}".format(
+                " ".join(empty_input_files)
+            )
+        )
+
+
+def generate_taxonomy_eval(
+    client: openai.OpenAI,
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    model_family: Optional[str] = None,
+    model_id: Optional[str] = None,
+    num_cpus: Optional[int] = None,
+    num_instructions_to_generate: Optional[int] = 30,
+    batch_size: Optional[int] = None,
+    max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+):
+    ctx = _context_init(
+        client,
+        model_family,
+        model_id,
+        num_instructions_to_generate,
+        None,  # disable checkpoints for eval pipeline
+        1,  # save_freq
+        batch_size=batch_size,
+        batch_num_workers=num_cpus,
+        max_num_tokens=max_num_tokens,
+    )
+    mmlu_bench_pipe = mmlubench_pipe_init(ctx)
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    for input_file in input_files:
+        logger.debug("Generating eval data from input file: %s", input_file)
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        samples_ds = Dataset.from_list(samples)
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
 
         if is_knowledge:
-            # generate mmlubench data for the current leaf node
             generate_eval_task_data(
                 mmlu_bench_pipe,
                 leaf_node_path,
-                samples,
+                samples_ds,
                 output_dir,
                 date_suffix,
             )
 
+
+def postprocess_taxonomy(
+    input_dir: str,
+    output_dir: str,
+    date_suffix: str,
+    pipeline: Optional[str] = "simple",
+    num_procs: Optional[int] = PipelineContext.DEFAULT_DATASET_NUM_PROCS,
+    system_prompt: Optional[str] = _SYS_PROMPT,
+    use_legacy_pretraining_format: Optional[bool] = True,
+):
+    knowledge_pipe, _, _ = _sdg_init(None, pipeline)
+    mixer = _mixer_init(
+        num_procs,
+        output_dir,
+        date_suffix,
+        knowledge_pipe.auxiliary_inst,
+        system_prompt,
+    )
+
+    input_files = glob.glob(f"{input_dir}/*.jsonl")
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True)
+
+    output_file_messages = f"messages_{date_suffix}.jsonl"
+    output_file_train = f"train_{date_suffix}.jsonl"
+
+    all_generated_data = []
+    for input_file in input_files:
+        logger.debug(
+            "Postprocessing generated taxonomy data in input file: %s", input_file
+        )
+        samples = jlload(input_file)
+        if not samples:
+            raise GenerateException(
+                f"Error: No samples found in input file {input_file}"
+            )
+        # For now we assume every sample in the file is the same type
+        first_sample = samples[0]
+        leaf_node_path, leaf_node_type = _extract_leaf_node_path_and_type(first_sample)
+        is_knowledge = False
+        if leaf_node_type == "knowledge":
+            is_knowledge = True
+
+        samples_ds = Dataset.from_list(samples)
+        logger.debug("Postprocessing from samples: %s", samples_ds)
+        all_generated_data.append(samples_ds)
+
         mixer.collect(
             leaf_node_path,
-            new_generated_data,
+            samples_ds,
             is_knowledge,
             use_legacy_pretraining_format,
         )
 
     _gen_train_data(
-        generated_data,
+        all_generated_data,
         os.path.join(output_dir, output_file_train),
         os.path.join(output_dir, output_file_messages),
         system_prompt,
     )
 
-    mixer.generate()
+    mixer.write_recipes()
+
+
+def mix_datasets(
+    recipe_file: str,
+    output_file: str,
+    num_proc: Optional[int] = 8,
+):
+    recipe = Recipe(recipe_file)
+    if recipe.datasets:
+        recipe.save_mixed_dataset(output_file, num_proc)
+    else:
+        logger.info("Not mixing empty recipe file: %s", recipe_file)
+
+
+# This is part of the public API, and used by instructlab.
+# TODO - parameter removal needs to be done in sync with a CLI change.
+# to be removed: logger +def generate_data( + client: openai.OpenAI, + logger: logging.Logger = logger, # pylint: disable=redefined-outer-name + system_prompt: Optional[str] = None, + use_legacy_pretraining_format: Optional[bool] = True, + model_family: Optional[str] = None, + model_name: Optional[str] = None, + num_cpus: Optional[int] = None, + num_instructions_to_generate: Optional[int] = 30, + taxonomy: Optional[str] = None, # TODO rename to taxonomy_path to match config + taxonomy_base: Optional[str] = None, + output_dir: Optional[str] = None, + console_output=True, + yaml_rules: Optional[str] = None, + chunk_word_count=None, + server_ctx_size=None, + pipeline: Optional[str] = "simple", + batch_size: Optional[int] = None, + checkpoint_dir: Optional[str] = None, + max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS, +) -> None: + """Generate data for training and testing a model. + + This currently serves as the primary interface from the `ilab` CLI to the `sdg` library. + It is somewhat a transitionary measure, as this function existed back when all of the + functionality was embedded in the CLI. At some stage, we expect to evolve the CLI to + use the SDG library constructs directly, and this function will likely be removed. + + Args: + pipeline: This argument may be either an alias defined in a user or site "data directory" + or an alias defined by the sdg library ("simple", "full")(if the data directory has no matches), + or an absolute path to a directory containing the pipeline YAML files. + We expect three files to be present in this directory: "knowledge.yaml", + "freeform_skills.yaml", and "grounded_skills.yaml". + """ + generate_start = time.time() + + system_prompt = system_prompt if system_prompt is not None else _SYS_PROMPT + + # FIXME: remove this when ilab knows to pass batch_size=0 with llama.cpp + if batch_size is None: + batch_size = 0 + + date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_") + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + output_file_test = output_dir.joinpath(f"test_{date_suffix}.jsonl") + preprocessed_dir = output_dir.joinpath(f"preprocessed_{date_suffix}") + generated_dir = output_dir.joinpath(f"generated_{date_suffix}") + + # This writes samples to disk in our output_dir and returns the + # list of files created + preprocess_taxonomy( + taxonomy, + output_dir=preprocessed_dir, + chunk_word_count=chunk_word_count, + server_ctx_size=server_ctx_size, + taxonomy_base=taxonomy_base, + teacher_model_path=model_name, + yaml_rules=yaml_rules, + test_output_file=output_file_test, + system_prompt=system_prompt, + ) + + generate_taxonomy( + client, + input_dir=preprocessed_dir, + output_dir=generated_dir, + logger=logger, + model_family=model_family, + model_id=model_name, + num_cpus=num_cpus, + num_instructions_to_generate=num_instructions_to_generate, + console_output=console_output, + pipeline=pipeline, + batch_size=batch_size, + checkpoint_dir=checkpoint_dir, + max_num_tokens=max_num_tokens, + ) + + generate_taxonomy_eval( + input_dir=preprocessed_dir, + output_dir=output_dir, + date_suffix=date_suffix, + client=client, + model_family=model_family, + model_id=model_name, + num_cpus=num_cpus, + num_instructions_to_generate=num_instructions_to_generate, + batch_size=batch_size, + max_num_tokens=max_num_tokens, + ) + + postprocess_taxonomy( + input_dir=generated_dir, + output_dir=output_dir, + date_suffix=date_suffix, + pipeline=pipeline, + system_prompt=system_prompt, + 
use_legacy_pretraining_format=use_legacy_pretraining_format, + ) + + mix_datasets( + recipe_file=f"{output_dir}/skills_recipe_{date_suffix}.yaml", + output_file=f"{output_dir}/skills_train_msgs_{date_suffix}.jsonl", + ) + mix_datasets( + recipe_file=f"{output_dir}/knowledge_recipe_{date_suffix}.yaml", + output_file=f"{output_dir}/knowledge_train_msgs_{date_suffix}.jsonl", + ) generate_duration = time.time() - generate_start logger.info(f"Generation took {generate_duration:.2f}s") - if len(empty_sdg_leaf_nodes) > 0: - logger.warning( - "Leaf nodes with empty sdg output: {}".format( - " ".join(empty_sdg_leaf_nodes) - ) - ) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 59613a8e..ce362668 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -16,7 +16,7 @@ # First Party from instructlab.sdg.checkpointing import Checkpointer -from instructlab.sdg.utils import pandas +from instructlab.sdg.utils import models, pandas # Local from .blocks import llmblock @@ -71,6 +71,9 @@ class PipelineContext: # pylint: disable=too-many-instance-attributes batch_size: int = DEFAULT_BATCH_SIZE batch_num_workers: Optional[int] = None + def __post_init__(self): + self.model_family = models.get_model_family(self.model_family, self.model_id) + @property def batching_enabled(self) -> bool: """Batching is enabled IFF the batch size is specified and the number of diff --git a/src/instructlab/sdg/utils/json.py b/src/instructlab/sdg/utils/json.py index 041d817b..1ec0b70a 100644 --- a/src/instructlab/sdg/utils/json.py +++ b/src/instructlab/sdg/utils/json.py @@ -60,3 +60,9 @@ def jldump(data: Iterable[Any], out: str | io.IOBase) -> None: for entry in data: json.dump(entry, outfile, ensure_ascii=False) outfile.write("\n") + + +def jlload(f, mode="r"): + """Load a .jsonl file into a list of dictionaries.""" + with _make_r_io_base(f, mode) as f_: + return [json.loads(l) for l in f_.read().splitlines()] diff --git a/src/instructlab/sdg/utils/models.py b/src/instructlab/sdg/utils/models.py index da01421b..b6375e57 100644 --- a/src/instructlab/sdg/utils/models.py +++ b/src/instructlab/sdg/utils/models.py @@ -22,5 +22,10 @@ def get_model_family(model_family, model_path): return model_family # Try to guess the model family based on the model's filename - guess = re.match(r"^\w*", os.path.basename(model_path)).group(0).lower() - return guess if guess in registry else DEFAULT_MODEL_FAMILY + if model_path: + guess = re.match(r"^\w*", os.path.basename(model_path)).group(0).lower() + if guess in registry: + return guess + + # Nothing was found, so just return the default + return DEFAULT_MODEL_FAMILY diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 2a23072f..ed0d7940 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -418,6 +418,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: "icl_document": icl_.get("context", ""), "document_outline": icl_.get("document_outline", ""), "domain": domain, + "leaf_node_type": "knowledge", } qna_pairs = icl_.get("questions_and_answers", []) @@ -431,7 +432,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: chunked_dataset.append(record) - return Dataset.from_list(chunked_dataset) + return chunked_dataset def _knowledge_leaf_node_to_samples( @@ -464,12 +465,23 @@ def _skill_leaf_node_to_samples(leaf_node): for i in range(len(leaf_node)): samples.append({}) samples[-1]["task_description"] = 
leaf_node[i]["task_description"]
+        sample_type = "freeform_skill"
         if leaf_node[i].get("input"):
+            sample_type = "grounded_skill"
             samples[-1]["seed_context"] = leaf_node[i]["input"]
         samples[-1]["seed_question"] = leaf_node[i]["instruction"]
         samples[-1]["seed_response"] = leaf_node[i]["output"]
+        samples[-1]["leaf_node_type"] = sample_type
 
-    return Dataset.from_list(samples)
+    return samples
+
+
+def _enrich_metadata(samples, leaf_node):
+    leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
+    for i, sample in enumerate(samples):
+        sample["leaf_node_path"] = leaf_node_path
+        samples[i] = sample
+    return samples
 
 
 def leaf_node_to_samples(
@@ -480,15 +492,17 @@
     model_name,
     docling_model_path=None,
 ):
-    if not leaf_node:
-        return []
-    if leaf_node[0].get("documents"):
-        return _knowledge_leaf_node_to_samples(
-            leaf_node,  # pylint: disable=duplicate-code
+    samples = []
+    if leaf_node and leaf_node[0].get("documents"):
+        samples = _knowledge_leaf_node_to_samples(
+            leaf_node,
             server_ctx_size,
             chunk_word_count,
             document_output_dir,
             model_name,
             docling_model_path,
         )
-    return _skill_leaf_node_to_samples(leaf_node)
+    elif leaf_node:
+        samples = _skill_leaf_node_to_samples(leaf_node)
+    samples = _enrich_metadata(samples, leaf_node)
+    return Dataset.from_list(samples)
diff --git a/tests/conftest.py b/tests/conftest.py
index ed3fd8c4..be3f249a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,7 +30,7 @@ def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
 
 def get_ctx(**kwargs) -> PipelineContext:
     kwargs.setdefault("client", mock.MagicMock())
-    kwargs.setdefault("model_family", "test")
+    kwargs.setdefault("model_family", "merlinite")
     kwargs.setdefault("model_id", "test-model")
     kwargs.setdefault("num_instructions_to_generate", 10)
     kwargs.setdefault("dataset_num_procs", 1)
diff --git a/tests/functional/__init__.py b/tests/functional/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/functional/test_granular_api.py b/tests/functional/test_granular_api.py
new file mode 100644
index 00000000..79207368
--- /dev/null
+++ b/tests/functional/test_granular_api.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Standard
+from datetime import datetime
+from unittest.mock import MagicMock
+import glob
+import pathlib
+
+# Third Party
+import git
+
+# First Party
+from instructlab.sdg import BlockRegistry
+from instructlab.sdg.generate_data import (
+    generate_taxonomy,
+    mix_datasets,
+    postprocess_taxonomy,
+    preprocess_taxonomy,
+)
+
+# Local
+from ..mockllmblock import MockLLMBlock
+
+
+def _clone_instructlab_taxonomy(taxonomy_dir):
+    taxonomy_repo_url = "https://github.com/instructlab/taxonomy"
+    taxonomy_commit = "dfa3afaf26f40f923cf758389719619ec9b1ddb1"
+    repo = git.Repo.clone_from(taxonomy_repo_url, taxonomy_dir, no_checkout=True)
+    repo.git.checkout(taxonomy_commit)
+
+
+def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.Path):
+    # Register our mock block so we can reference it in pipelines
+    BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
+
+    # Clone a taxonomy and edit 1 file in it
+    taxonomy_dir = tmp_path.joinpath("taxonomy")
+    _clone_instructlab_taxonomy(taxonomy_dir)
+    changed_qna_yaml = taxonomy_dir.joinpath(
+        "knowledge", "science", "animals", "birds", "black_capped_chickadee", "qna.yaml"
+    )
+    with open(changed_qna_yaml, "a", encoding="utf-8") as file:
+        file.write("")
+
+    pipeline_dir = testdata_path.joinpath("mock_pipelines")
+    date_suffix = 
datetime.now().replace(microsecond=0).isoformat().replace(":", "_") + + preprocessed_dir = tmp_path.joinpath("preprocessed") + preprocess_taxonomy( + taxonomy_dir=taxonomy_dir, + output_dir=preprocessed_dir, + ) + chickadee_docs = glob.glob( + str( + preprocessed_dir.joinpath( + "documents", "knowledge_science_*", "chickadee.md" + ) + ) + ) + assert chickadee_docs + chickadee_samples_path = preprocessed_dir.joinpath( + "knowledge_science_animals_birds_black_capped_chickadee.jsonl" + ) + assert chickadee_samples_path.is_file() + + client = MagicMock() + client.server_supports_batched = False + generated_dir = tmp_path.joinpath("generated") + generate_taxonomy( + client=client, + input_dir=preprocessed_dir, + output_dir=generated_dir, + pipeline=pipeline_dir, + ) + generated_chickadee_samples_path = generated_dir.joinpath( + "knowledge_science_animals_birds_black_capped_chickadee.jsonl" + ) + assert generated_chickadee_samples_path.is_file() + + postprocessed_dir = tmp_path.joinpath("postprocessed") + postprocess_taxonomy( + input_dir=generated_dir, + output_dir=postprocessed_dir, + date_suffix=date_suffix, + pipeline=pipeline_dir, + ) + knowledge_recipe_file = postprocessed_dir.joinpath( + f"knowledge_recipe_{date_suffix}.yaml" + ) + assert knowledge_recipe_file.is_file() + skills_recipe_file = postprocessed_dir.joinpath(f"skills_recipe_{date_suffix}.yaml") + assert skills_recipe_file.is_file() + + mixed_skills_output_file = ( + f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl" + ) + mix_datasets( + recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml", + output_file=mixed_skills_output_file, + ) + assert pathlib.Path(mixed_skills_output_file).is_file() diff --git a/tests/mockllmblock.py b/tests/mockllmblock.py new file mode 100644 index 00000000..744cd6d5 --- /dev/null +++ b/tests/mockllmblock.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import random +import string + +# Third Party +from datasets import Dataset + +# First Party +from instructlab.sdg import LLMBlock + + +def _random_string(size): + return "".join(random.choices(string.ascii_lowercase, k=size)) + + +def _add_mocked_cols(sample, block_name): + match block_name: + case "gen_questions" | "gen_grounded_questions": + sample["question"] = f"Is this a question {_random_string(8)}?" + case "eval_questions" | "eval_grounded_questions": + sample["evaluation"] = "This is an evaluation." + sample["score"] = "1" + case "gen_responses" | "gen_grounded_responses": + sample["response"] = "This is a response." + case "evaluate_qa_pair" | "evaluate_grounded_qa_pair": + sample["evaluation"] = "This is an evaluation." + sample["score"] = "2" + case "gen_contexts": + sample["context"] = f"This is a context {_random_string(8)}." + case "gen_spellcheck": + sample["spellcheck"] = sample["document"] + case "gen_knowledge": + sample["question"] = f"Is this a question {_random_string(8)}?" + sample["response"] = "This is a response." + case "eval_faithfulness_qa_pair": + sample["explanation"] = "This is an explanation." + sample["judgment"] = "YES" + case "eval_relevancy_qa_pair": + sample["feedback"] = "This is some feedback." + sample["score"] = "2" + case "eval_verify_question": + sample["explanation"] = "This is an explanation." + sample["rating"] = "1" + case _: + raise Exception( + f"Received an un-mocked LLMBlock: {block_name}. Add code in {__file__} to handle this block." 
+ ) + return sample + + +class MockLLMBlock(LLMBlock): + def generate(self, samples: Dataset): + return samples.map(_add_mocked_cols, fn_kwargs={"block_name": self.block_name}) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index c2968029..0116253e 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -21,7 +21,12 @@ # First Party from instructlab.sdg import LLMBlock, PipelineContext -from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data +from instructlab.sdg.generate_data import ( + _context_init, + _locate_docling_models, + _sdg_init, + generate_data, +) TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." @@ -85,7 +90,7 @@ def validate_messages_dataset(dataset_file_name, expected_samples): def validate_skill_leaf_node_dataset(dataset_file_name): ds = load_dataset("json", data_files=dataset_file_name, split="train") - assert len(ds.features) == 7 + assert len(ds.features) == 9 features = [ "task_description", "seed_context", @@ -93,6 +98,8 @@ def validate_skill_leaf_node_dataset(dataset_file_name): "seed_response", "output", "id", + "leaf_node_path", + "leaf_node_type", ] for feature in features: assert feature in ds.features @@ -518,7 +525,8 @@ def test_generate(self): ) mocked_logger.warning.assert_called() assert re.search( - "empty sdg output: knowledge_new", mocked_logger.warning.call_args.args[0] + "empty sdg output: .+knowledge_new.jsonl", + mocked_logger.warning.call_args.args[0], ) def teardown(self) -> None: @@ -567,35 +575,15 @@ def test_context_init_batch_size_optional(): assert ctx.batch_size == 20 -def test_sdg_init_docling_path_config_found(testdata_path): +def test_locate_docling_models_config_found(testdata_path): with patch.dict(os.environ): os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir")) - ctx = _context_init( - None, - "mixtral", - "foo.bar", - 1, - "/checkpoint/dir", - 1, - batch_size=20, - batch_num_workers=32, - ) - _, _, _, docling_model_path = _sdg_init(ctx, "full") + docling_model_path = _locate_docling_models() assert docling_model_path == "/mock/docling-models" -def test_sdg_init_docling_path_config_not_found(testdata_path): +def test_locate_docling_models_config_not_found(testdata_path): with patch.dict(os.environ): os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir")) - ctx = _context_init( - None, - "mixtral", - "foo.bar", - 1, - "/checkpoint/dir", - 1, - batch_size=20, - batch_num_workers=32, - ) - _, _, _, docling_model_path = _sdg_init(ctx, "full") + docling_model_path = _locate_docling_models() assert docling_model_path is None diff --git a/tests/test_models.py b/tests/test_models.py index e3755553..ef4dd412 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -64,3 +64,6 @@ def test_unknown_model_family(self): "foobar", "./models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf" ) assert "Unknown model family: foobar" in str(exc.value) + + def test_none_args(self): + assert models.get_model_family(None, None) == "merlinite" diff --git a/tests/testdata/mock_pipelines/freeform_skills.yaml b/tests/testdata/mock_pipelines/freeform_skills.yaml new file mode 100644 index 00000000..9f9043cb --- /dev/null +++ b/tests/testdata/mock_pipelines/freeform_skills.yaml @@ -0,0 +1,53 @@ +version: "1.0" +blocks: + - name: gen_questions + type: MockLLMBlock + config: + 
config_path: ../../../src/instructlab/sdg/configs/skills/freeform_questions.yaml + output_cols: + - question + batch_kwargs: + num_samples: 30 + drop_duplicates: + - question + - name: eval_questions + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_freeform_questions.yaml + output_cols: + - evaluation + - score + - name: filter_questions + type: FilterByValueBlock + config: + filter_column: score + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - evaluation + - score + - num_samples + - name: gen_responses + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/freeform_responses.yaml + output_cols: + - response + - name: evaluate_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_freeform_pair.yaml + output_cols: + - evaluation + - score + - name: filter_qa_pair + type: FilterByValueBlock + config: + filter_column: score + filter_value: 2.0 + operation: ge + convert_dtype: float + drop_columns: + - evaluation + - score diff --git a/tests/testdata/mock_pipelines/grounded_skills.yaml b/tests/testdata/mock_pipelines/grounded_skills.yaml new file mode 100644 index 00000000..4aceaf0f --- /dev/null +++ b/tests/testdata/mock_pipelines/grounded_skills.yaml @@ -0,0 +1,70 @@ +version: "1.0" +blocks: + - name: gen_contexts + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/contexts.yaml + output_cols: + - context + gen_kwargs: + temperature: 0.7 + max_tokens: 2048 + n: 10 + seed: 42 + drop_duplicates: + - context + - name: gen_grounded_questions + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/grounded_questions.yaml + output_cols: + - question + batch_kwargs: + num_samples: 3 + drop_duplicates: + - question + - name: eval_grounded_questions + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml + output_cols: + - evaluation + - score + - name: filter_grounded_questions + type: FilterByValueBlock + config: + filter_column: score + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - evaluation + - score + - num_samples + - name: gen_grounded_responses + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/grounded_responses.yaml + output_cols: + - response + - name: evaluate_grounded_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml + output_cols: + - evaluation + - score + - name: filter_grounded_qa_pair + type: FilterByValueBlock + config: + filter_column: score + filter_value: 2.0 + operation: ge + convert_dtype: float + - name: combine_question_and_context + type: CombineColumnsBlock + config: + columns: + - context + - question + output_col: question diff --git a/tests/testdata/mock_pipelines/knowledge.yaml b/tests/testdata/mock_pipelines/knowledge.yaml new file mode 100644 index 00000000..1eb2d066 --- /dev/null +++ b/tests/testdata/mock_pipelines/knowledge.yaml @@ -0,0 +1,113 @@ +version: "1.0" +blocks: + - name: duplicate_document_col + type: DuplicateColumnsBlock + config: + columns_map: + document: base_document + + - name: gen_spellcheck + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/spellcheck.yaml + output_cols: + - spellcheck + gen_kwargs: + max_tokens: 2048 + + - name: flatten_auxiliary_columns + 
type: FlattenColumnsBlock + config: + var_cols: + - spellcheck + - base_document + value_name: corrected_document + var_name: dataset_type + + - name: rename_to_document_column + type: RenameColumnsBlock + config: + columns_map: + document: raw_document + corrected_document: document + + - name: gen_knowledge + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml + output_cols: + - question + - response + parser_kwargs: + parser_name: custom + parsing_pattern: '\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)' + parser_cleanup_tags: + - "[END]" + - "[End]" + gen_kwargs: + max_tokens: 2048 + drop_duplicates: + - question + - name: eval_faithfulness_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/evaluate_faithfulness.yaml + output_cols: + - explanation + - judgment + gen_kwargs: + max_tokens: 2048 + - name: filter_faithfulness + type: FilterByValueBlock + config: + filter_column: judgment + filter_value: "YES" + operation: eq + drop_columns: + - judgment + - explanation + - name: eval_relevancy_qa_pair + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/evaluate_relevancy.yaml + output_cols: + - feedback + - score + gen_kwargs: + max_tokens: 2048 + - name: filter_relevancy + type: FilterByValueBlock + config: + filter_column: score + filter_value: 2.0 + operation: eq + convert_dtype: float + drop_columns: + - feedback + - score + - name: eval_verify_question + type: MockLLMBlock + config: + config_path: ../../../src/instructlab/sdg/configs/knowledge/evaluate_question.yaml + output_cols: + - explanation + - rating + gen_kwargs: + max_tokens: 2048 + - name: filter_verify_question + type: FilterByValueBlock + config: + filter_column: rating + filter_value: 1.0 + operation: eq + convert_dtype: float + drop_columns: + - explanation + - rating + - __index_level_0__ + +datamixing: + auxiliary_instructions: + spellcheck: + - Correct any spelling errors in the document and output the corrected version. + - Rewrite the document to remove any spelling errors. From 6c8544e5abdeb73c6675264298886e906a610c23 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Wed, 15 Jan 2025 13:16:17 -0500 Subject: [PATCH 2/3] Parameterize # of seed examples when converting to test data Instead of hardcoding this to always be 3, add a parameter with a default of 3 when converting our seed examples to the test output dataset. 
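
For example, a caller that only wants the first two Q&A pairs from each
knowledge seed example in its test data could now do (a sketch against
the new signature; `seed_example` is any preprocessed knowledge sample
carrying the `icl_query_*`/`icl_response_*` keys):

    test_samples = _knowledge_seed_example_to_test_data(
        seed_example, system_prompt, num_iterations=2
    )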
Co-authored-by: Aakanksha Duggal
Signed-off-by: Ben Browning
---
 src/instructlab/sdg/generate_data.py  | 4 ++--
 tests/functional/test_granular_api.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index a14a190a..a94683a4 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -123,9 +123,9 @@ def _gen_train_data(
     jldump(messages_data, output_file_messages)
 
 
-def _knowledge_seed_example_to_test_data(seed_example, system_prompt):
+def _knowledge_seed_example_to_test_data(seed_example, system_prompt, num_iterations=3):
     res = []
-    for i in range(3):
+    for i in range(num_iterations):
         idx = i + 1
         user = seed_example[f"icl_query_{idx}"] + "\n" + seed_example["icl_document"]
         test_sample = {
diff --git a/tests/functional/test_granular_api.py b/tests/functional/test_granular_api.py
index 79207368..33de8a25 100644
--- a/tests/functional/test_granular_api.py
+++ b/tests/functional/test_granular_api.py
@@ -46,9 +46,11 @@ def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.
     date_suffix = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
 
     preprocessed_dir = tmp_path.joinpath("preprocessed")
+    teacher_model_path = testdata_path.joinpath("models/instructlab/granite-7b-lab")
     preprocess_taxonomy(
         taxonomy_dir=taxonomy_dir,
         output_dir=preprocessed_dir,
+        teacher_model_path=teacher_model_path,
     )
     chickadee_docs = glob.glob(
         str(
 
From 1f9394d6dfe890310c29befc4c4ad0461635351c Mon Sep 17 00:00:00 2001
From: Ben Browning
Date: Wed, 15 Jan 2025 14:50:26 -0500
Subject: [PATCH 3/3] Add examples for how to use data mixing

This adds a new docs/examples/mix_datasets folder with a couple of
example recipes, two sample datasets, and an example_mixing.py Python
script to show how to mix datasets.

This also adds a test_examples.py file that actually runs our examples,
ensuring they work without error and generate the expected mixed
datasets.
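
Stripped of its path handling, the example script boils down to the
following sketch (recipe and output paths are the ones shipped in this
change):

    from instructlab.sdg import mix_datasets

    # Take all samples from both datasets (sampling_size: 1.0 each)
    mix_datasets("concatenate_recipe.yaml", "output/concatenated.jsonl")

    # Double dataset 1 (sampling_size: 2.0), take 20% of dataset 2 (0.2)
    mix_datasets("weighted_recipe.yaml", "output/weighted.jsonl")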
Signed-off-by: Ben Browning --- .gitignore | 3 + .../mix_datasets/concatenate_recipe.yaml | 8 + docs/examples/mix_datasets/dataset_1.jsonl | 5 + docs/examples/mix_datasets/dataset_2.jsonl | 5 + docs/examples/mix_datasets/example_mixing.py | 18 ++ .../mix_datasets/weighted_recipe.yaml | 9 + tests/functional/conftest.py | 7 + tests/functional/test_examples.py | 40 +++++ tests/functional/test_granular_api.py | 170 ++++++++++-------- tests/taxonomy.py | 7 +- tests/test_generate_data.py | 8 +- 11 files changed, 203 insertions(+), 77 deletions(-) create mode 100644 docs/examples/mix_datasets/concatenate_recipe.yaml create mode 100644 docs/examples/mix_datasets/dataset_1.jsonl create mode 100644 docs/examples/mix_datasets/dataset_2.jsonl create mode 100644 docs/examples/mix_datasets/example_mixing.py create mode 100644 docs/examples/mix_datasets/weighted_recipe.yaml create mode 100644 tests/functional/test_examples.py diff --git a/.gitignore b/.gitignore index b2911a08..70a55104 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,6 @@ cython_debug/ # IDEs .vscode/ + +# SDG examples output +docs/examples/**/output \ No newline at end of file diff --git a/docs/examples/mix_datasets/concatenate_recipe.yaml b/docs/examples/mix_datasets/concatenate_recipe.yaml new file mode 100644 index 00000000..c3d95923 --- /dev/null +++ b/docs/examples/mix_datasets/concatenate_recipe.yaml @@ -0,0 +1,8 @@ +# An example of how to concatenate two datasets +# Each dataset has a sampling_size of 1.0 to take all samples from both +datasets: +- path: dataset_1.jsonl + sampling_size: 1.0 +- path: dataset_2.jsonl + sampling_size: 1.0 +sys_prompt: I am a reliable AI assistant. diff --git a/docs/examples/mix_datasets/dataset_1.jsonl b/docs/examples/mix_datasets/dataset_1.jsonl new file mode 100644 index 00000000..89059add --- /dev/null +++ b/docs/examples/mix_datasets/dataset_1.jsonl @@ -0,0 +1,5 @@ +{"id": "dataset_1_1", "messages": [], "metadata": {}} +{"id": "dataset_1_2", "messages": [], "metadata": {}} +{"id": "dataset_1_3", "messages": [], "metadata": {}} +{"id": "dataset_1_4", "messages": [], "metadata": {}} +{"id": "dataset_1_5", "messages": [], "metadata": {}} diff --git a/docs/examples/mix_datasets/dataset_2.jsonl b/docs/examples/mix_datasets/dataset_2.jsonl new file mode 100644 index 00000000..35e063e9 --- /dev/null +++ b/docs/examples/mix_datasets/dataset_2.jsonl @@ -0,0 +1,5 @@ +{"id": "dataset_2_1", "messages": [], "metadata": {}} +{"id": "dataset_2_2", "messages": [], "metadata": {}} +{"id": "dataset_2_3", "messages": [], "metadata": {}} +{"id": "dataset_2_4", "messages": [], "metadata": {}} +{"id": "dataset_2_5", "messages": [], "metadata": {}} diff --git a/docs/examples/mix_datasets/example_mixing.py b/docs/examples/mix_datasets/example_mixing.py new file mode 100644 index 00000000..5382d9cb --- /dev/null +++ b/docs/examples/mix_datasets/example_mixing.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +from pathlib import Path + +# First Party +from instructlab.sdg import mix_datasets + +output_dir = Path(__file__).parent.joinpath("output") +output_dir.mkdir(exist_ok=True) + +concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml") +concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl") +mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl) + +weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml") +weighted_output_jsonl = output_dir.joinpath("weighted.jsonl") +mix_datasets(weighted_recipe_yaml, 
weighted_output_jsonl)
diff --git a/docs/examples/mix_datasets/weighted_recipe.yaml b/docs/examples/mix_datasets/weighted_recipe.yaml
new file mode 100644
index 00000000..39d6f17b
--- /dev/null
+++ b/docs/examples/mix_datasets/weighted_recipe.yaml
@@ -0,0 +1,9 @@
+# An example of how to weight one dataset over another
+# Dataset 1 has a sampling size of 2.0 to double its samples
+# Dataset 2 has a sampling size of 0.2 to take 20% of its samples
+datasets:
+- path: dataset_1.jsonl
+  sampling_size: 2.0
+- path: dataset_2.jsonl
+  sampling_size: 0.2
+sys_prompt: I am a reliable AI assistant.
diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py
index c1793a94..9c5ec111 100644
--- a/tests/functional/conftest.py
+++ b/tests/functional/conftest.py
@@ -6,9 +6,16 @@
 import pytest
 
 TESTS_PATH = pathlib.Path(__file__).parent.parent.absolute()
+EXAMPLES_PATH = TESTS_PATH.parent.joinpath("docs", "examples")
 
 
 @pytest.fixture
 def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
     """Path to local test data directory"""
     yield TESTS_PATH / "testdata"
+
+
+@pytest.fixture
+def examples_path() -> typing.Generator[pathlib.Path, None, None]:
+    """Path to examples directory"""
+    yield EXAMPLES_PATH
diff --git a/tests/functional/test_examples.py b/tests/functional/test_examples.py
new file mode 100644
index 00000000..c1fe8bd0
--- /dev/null
+++ b/tests/functional/test_examples.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Standard
+import pathlib
+import shutil
+import subprocess
+import sys
+
+# First Party
+from instructlab.sdg.utils.json import jlload
+
+
+def test_example_mixing(tmp_path: pathlib.Path, examples_path: pathlib.Path):
+    example_copy_path = tmp_path.joinpath("mix_datasets")
+    shutil.copytree(examples_path.joinpath("mix_datasets"), example_copy_path)
+    script = example_copy_path.joinpath("example_mixing.py")
+    subprocess.check_call([sys.executable, str(script)], text=True)
+
+    concatenated = jlload(example_copy_path.joinpath("output", "concatenated.jsonl"))
+    assert len(concatenated) == 10
+    from_ds_1 = []
+    from_ds_2 = []
+    for sample in concatenated:
+        if sample["id"].startswith("dataset_1"):
+            from_ds_1.append(sample)
+        else:
+            from_ds_2.append(sample)
+    assert len(from_ds_1) == len(from_ds_2) == 5
+
+    weighted = jlload(example_copy_path.joinpath("output", "weighted.jsonl"))
+    assert len(weighted) == 11
+    from_ds_1 = []
+    from_ds_2 = []
+    for sample in weighted:
+        if sample["id"].startswith("dataset_1"):
+            from_ds_1.append(sample)
+        else:
+            from_ds_2.append(sample)
+    assert len(from_ds_1) == 10
+    assert len(from_ds_2) == 1
diff --git a/tests/functional/test_granular_api.py b/tests/functional/test_granular_api.py
index 33de8a25..6b83bbc6 100644
--- a/tests/functional/test_granular_api.py
+++ b/tests/functional/test_granular_api.py
@@ -4,10 +4,13 @@
 from datetime import datetime
 from unittest.mock import MagicMock
 import glob
+import os
 import pathlib
+import unittest
 
 # Third Party
 import git
+import pytest
 
 # First Party
 from instructlab.sdg import BlockRegistry
@@ -20,6 +23,7 @@
 
 # Local
 from ..mockllmblock import MockLLMBlock
+from ..taxonomy import load_test_skills
 
 
 def _clone_instructlab_taxonomy(taxonomy_dir):
@@ -29,75 +33,99 @@ def _clone_instructlab_taxonomy(taxonomy_dir):
     repo.git.checkout(taxonomy_commit)
 
 
-def test_granular_api_end_to_end(testdata_path: pathlib.Path, tmp_path: pathlib.Path):
+class TestGranularAPI(unittest.TestCase):
+    @pytest.fixture(autouse=True)
+    def _init_taxonomy(self, taxonomy_dir, testdata_path, tmp_path):
+        self.test_taxonomy = taxonomy_dir
+        self.testdata_path = testdata_path
+        self.tmp_path = tmp_path
+
+    def setUp(self):
+        test_valid_knowledge_skill_file = self.testdata_path.joinpath(
+            "test_valid_knowledge_skill.yaml"
+        )
+        untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
+        test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
+        self.test_taxonomy.create_untracked(
+            untracked_knowledge_file, test_valid_knowledge_skill
+        )
+
+    def file_list(self):
+        return glob.glob(str(self.tmp_path.joinpath("**/*")), recursive=True)
+
+    def test_granular_api_end_to_end(self):
+        # Register our mock block so we can reference it in pipelines
+        BlockRegistry.register("MockLLMBlock")(MockLLMBlock)
+
+        # Use the test taxonomy from our fixtures, seeded in setUp with 1 new qna.yaml
+        taxonomy_dir = self.tmp_path
+
+        pipeline_dir = self.testdata_path.joinpath("mock_pipelines")
+        date_suffix = (
+            datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+        )
+
+        preprocessed_dir = self.tmp_path.joinpath("preprocessed")
+        teacher_model_path = self.testdata_path.joinpath(
+            "models/instructlab/granite-7b-lab"
+        )
+        preprocess_taxonomy(
+            taxonomy_dir=taxonomy_dir,
+            output_dir=preprocessed_dir,
+            teacher_model_path=teacher_model_path,
+        )
+        docs = glob.glob(
+            str(preprocessed_dir.joinpath("documents", "knowledge_new_*", "phoenix.md"))
+        )
+        assert docs, f"Expected docs not found in {self.file_list()}"
+        samples_path = preprocessed_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            samples_path.is_file()
+        ), f"Expected samples file not found in {self.file_list()}"
+
+        client = MagicMock()
+        client.server_supports_batched = False
+        generated_dir = self.tmp_path.joinpath("generated")
+        generate_taxonomy(
+            client=client,
+            input_dir=preprocessed_dir,
+            output_dir=generated_dir,
+            pipeline=pipeline_dir,
+            num_cpus=1,  # Test is faster running on a single CPU vs forking
+            batch_size=0,  # Disable batch for tiny dataset and fastest test
+        )
+        generated_samples_path = generated_dir.joinpath("knowledge_new.jsonl")
+        assert (
+            generated_samples_path.is_file()
+        ), f"Generated samples not found in {self.file_list()}"
+
+        postprocessed_dir = self.tmp_path.joinpath("postprocessed")
+        postprocess_taxonomy(
+            input_dir=generated_dir,
+            output_dir=postprocessed_dir,
+            date_suffix=date_suffix,
+            pipeline=pipeline_dir,
+        )
+        knowledge_recipe_file = postprocessed_dir.joinpath(
+            f"knowledge_recipe_{date_suffix}.yaml"
+        )
+        assert (
+            knowledge_recipe_file.is_file()
+        ), f"Generated knowledge recipe file not found in {self.file_list()}"
+        
skills_recipe_file = postprocessed_dir.joinpath( + f"skills_recipe_{date_suffix}.yaml" + ) + assert ( + skills_recipe_file.is_file() + ), f"Generated skills recipe file not found in {self.file_list()}" + + mixed_skills_output_file = ( + f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl" + ) + mix_datasets( + recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml", + output_file=mixed_skills_output_file, ) - ) - assert chickadee_docs - chickadee_samples_path = preprocessed_dir.joinpath( - "knowledge_science_animals_birds_black_capped_chickadee.jsonl" - ) - assert chickadee_samples_path.is_file() - - client = MagicMock() - client.server_supports_batched = False - generated_dir = tmp_path.joinpath("generated") - generate_taxonomy( - client=client, - input_dir=preprocessed_dir, - output_dir=generated_dir, - pipeline=pipeline_dir, - ) - generated_chickadee_samples_path = generated_dir.joinpath( - "knowledge_science_animals_birds_black_capped_chickadee.jsonl" - ) - assert generated_chickadee_samples_path.is_file() - - postprocessed_dir = tmp_path.joinpath("postprocessed") - postprocess_taxonomy( - input_dir=generated_dir, - output_dir=postprocessed_dir, - date_suffix=date_suffix, - pipeline=pipeline_dir, - ) - knowledge_recipe_file = postprocessed_dir.joinpath( - f"knowledge_recipe_{date_suffix}.yaml" - ) - assert knowledge_recipe_file.is_file() - skills_recipe_file = postprocessed_dir.joinpath(f"skills_recipe_{date_suffix}.yaml") - assert skills_recipe_file.is_file() - - mixed_skills_output_file = ( - f"{postprocessed_dir}/skills_train_msgs_{date_suffix}.jsonl" - ) - mix_datasets( - recipe_file=f"{postprocessed_dir}/skills_recipe_{date_suffix}.yaml", - output_file=mixed_skills_output_file, - ) - assert pathlib.Path(mixed_skills_output_file).is_file() + assert pathlib.Path( + mixed_skills_output_file + ).is_file(), f"Generated mixed output not found in {self.file_list()}" diff --git a/tests/taxonomy.py b/tests/taxonomy.py index 227c2534..01904ad3 100644 --- a/tests/taxonomy.py +++ b/tests/taxonomy.py @@ -2,7 +2,7 @@ # Standard from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import shutil # Third Party @@ -68,3 +68,8 @@ def __enter__(self): def __exit__(self, *args): self.teardown() + + +def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: + with open(skills_file_path, "r", encoding="utf-8") as skills_file: + return yaml.safe_load(skills_file) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index 0116253e..0e718383 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -28,6 +28,9 @@ generate_data, ) +# Local +from .taxonomy import load_test_skills + TEST_SYS_PROMPT = "I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant." TEST_TAXONOMY_BASE = "main" @@ -232,11 +235,6 @@ def add_question_mark(q): return train_samples -def load_test_skills(skills_file_path) -> Union[Dict[str, Any], None]: - with open(skills_file_path, "r", encoding="utf-8") as skills_file: - return yaml.safe_load(skills_file) - - def _noop_llmblock_generate(self, samples): """Generate mock output based on input samples.