Ensure knowledge docs are cloned into unique dirs #416

Merged · 1 commit · Nov 27, 2024
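
Summary of the change: previously, every knowledge leaf node cloned its reference documents into the same document_output_dir, so documents from one leaf node could overwrite another's. Each leaf node now gets its own uniquely named subdirectory under document_output_dir, created with tempfile.mkdtemp and prefixed with the flattened leaf-node path, and the tests are extended to exercise two knowledge leaf nodes side by side.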

src/instructlab/sdg/utils/taxonomy.py (8 additions, 1 deletion)

@@ -2,6 +2,7 @@
 
 # Standard
 from pathlib import Path
+from tempfile import mkdtemp
 from typing import Dict, List, Tuple, Union
 import glob
 import logging

@@ -257,14 +258,20 @@ def _read_taxonomy_file(
     try:
         # get seed instruction data
         tax_path = "->".join(taxonomy.path.parent.parts)
+        leaf_node_path = tax_path.replace("->", "_")
         contents = taxonomy.contents
         task_description = contents.get("task_description", None)
         domain = contents.get("domain")
         documents = contents.get("document")
         document_contents, doc_filepaths = None, None
         if documents:
+            os.makedirs(document_output_dir, exist_ok=True)
+            unique_output_dir = mkdtemp(
+                prefix=f"{leaf_node_path}_", dir=document_output_dir
+            )
             document_contents, doc_filepaths = _get_documents(
-                source=documents, document_output_dir=document_output_dir
+                source=documents,
+                document_output_dir=unique_output_dir,
             )
             logger.debug("Content from git repo fetched")
 
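
For context on the fix: tempfile.mkdtemp creates a fresh, uniquely named directory on every call, so two knowledge leaf nodes that share the same document_output_dir can no longer clobber each other's cloned documents. A minimal sketch of that behavior, with hypothetical directory names:

    from tempfile import mkdtemp
    import os

    # Hypothetical shared parent directory for all knowledge docs
    document_output_dir = "/tmp/knowledge_docs"
    os.makedirs(document_output_dir, exist_ok=True)

    # Each call returns a distinct directory, e.g.
    # /tmp/knowledge_docs/knowledge_new1_x1a2b3 and
    # /tmp/knowledge_docs/knowledge_new2_q8z7y6
    dir_one = mkdtemp(prefix="knowledge_new1_", dir=document_output_dir)
    dir_two = mkdtemp(prefix="knowledge_new2_", dir=document_output_dir)
    assert dir_one != dir_two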

tests/test_generate_data.py (28 additions, 16 deletions)

@@ -119,10 +119,10 @@ def validate_phase_leaf_node_dataset(dataset_file_name):
     assert ds.features["messages"][0]["role"].dtype == "string"
 
 
-def validate_recipe(recipe_file_name):
+def validate_recipe(recipe_file_name, num_datasets):
     with open(recipe_file_name, encoding="utf-8") as fp:
         yaml_contents = yaml.safe_load(fp)
-    assert len(yaml_contents["datasets"]) == 1
+    assert len(yaml_contents["datasets"]) == num_datasets
     assert yaml_contents["datasets"][0]["path"].endswith(".jsonl")
     assert "sampling_size" in yaml_contents["datasets"][0]
     assert yaml_contents["metadata"]["sys_prompt"] == TEST_SYS_PROMPT
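
For reference, validate_recipe assumes the recipe file is a YAML mapping with a top-level datasets list and a metadata.sys_prompt field. A small illustrative sketch (the paths and sampling sizes here are made up) of a recipe that would pass the core checks with num_datasets=2:

    import yaml

    # Illustrative recipe contents for a run with two knowledge leaf nodes
    recipe = {
        "datasets": [
            {"path": "node_datasets_x/knowledge_new1_p10.jsonl", "sampling_size": 30},
            {"path": "node_datasets_x/knowledge_new2_p10.jsonl", "sampling_size": 30},
        ],
        "metadata": {"sys_prompt": "I am a helpful assistant."},
    }
    print(yaml.safe_dump(recipe))  # what such a recipe file would look like on disk

    # The core assertions validate_recipe(recipe_file, num_datasets=2) makes:
    assert len(recipe["datasets"]) == 2
    assert recipe["datasets"][0]["path"].endswith(".jsonl")
    assert "sampling_size" in recipe["datasets"][0]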

@@ -344,7 +344,7 @@ def test_generate(self):
             if name.endswith("compositional_skills_new.jsonl"):
                 validate_skill_leaf_node_dataset(matches[0])
             elif name.startswith("skills_recipe_"):
-                validate_recipe(matches[0])
+                validate_recipe(matches[0], 1)
             elif name.startswith("skills_train_msgs_"):
                 validate_mixed_dataset(matches[0])
 

@@ -374,13 +374,19 @@ def setUp(self):
             TEST_DATA_DIR, "test_valid_knowledge_skill.yaml"
         )
         tracked_knowledge_file = os.path.join("knowledge ", "tracked", "qna.yaml")
-        untracked_knowledge_file = os.path.join("knowledge", "new", "qna.yaml")
+        # Explicitly add 2 files here to ensure multiple knowledge leaf nodes
+        # don't conflict in anything like document_output_dir for knowledge docs
+        untracked_knowledge_file1 = os.path.join("knowledge", "new1", "qna.yaml")
+        untracked_knowledge_file2 = os.path.join("knowledge", "new2", "qna.yaml")
         test_valid_knowledge_skill = load_test_skills(test_valid_knowledge_skill_file)
         self.test_taxonomy.add_tracked(
             tracked_knowledge_file, test_valid_knowledge_skill
         )
         self.test_taxonomy.create_untracked(
-            untracked_knowledge_file, test_valid_knowledge_skill
+            untracked_knowledge_file1, test_valid_knowledge_skill
+        )
+        self.test_taxonomy.create_untracked(
+            untracked_knowledge_file2, test_valid_knowledge_skill
         )
         self.expected_test_samples = generate_test_samples(test_valid_knowledge_skill)
         self.expected_train_samples = generate_train_samples(test_valid_knowledge_skill)

@@ -412,40 +418,46 @@ def test_generate(self):
             elif name.startswith("messages_"):
                 validate_messages_dataset(matches[0], self.expected_train_samples)
 
-        node_p07_file = os.path.join("node_datasets_*", "knowledge_new_p07.jsonl")
-        node_p10_file = os.path.join("node_datasets_*", "knowledge_new_p10.jsonl")
+        node1_p07_file = os.path.join("node_datasets_*", "knowledge_new1_p07.jsonl")
+        node1_p10_file = os.path.join("node_datasets_*", "knowledge_new1_p10.jsonl")
+        node2_p07_file = os.path.join("node_datasets_*", "knowledge_new2_p07.jsonl")
+        node2_p10_file = os.path.join("node_datasets_*", "knowledge_new2_p10.jsonl")
         for name in [
             "skills_recipe_*.yaml",
             "skills_train_*.jsonl",
             "knowledge_recipe_*.yaml",
             "knowledge_train_msgs_*.jsonl",
-            node_p07_file,
-            node_p10_file,
+            node1_p07_file,
+            node1_p10_file,
+            node2_p07_file,
+            node2_p10_file,
         ]:
             matches = glob.glob(os.path.join(self.tmp_path, name))
             assert len(matches) == 1
-            if name.endswith("knowledge_new_p07.jsonl") or name.endswith(
-                "knowledge_new_p10.jsonl"
+            if name.endswith("knowledge_new1_p07.jsonl") or name.endswith(
+                "knowledge_new1_p10.jsonl"
             ):
                 validate_phase_leaf_node_dataset(matches[0])
             elif name.startswith("skills_recipe_") or name.startswith(
                 "knowledge_recipe_"
             ):
-                validate_recipe(matches[0])
+                validate_recipe(matches[0], 2)
             elif name.startswith("skills_train_msgs_") or name.startswith(
                 "knowledge_train_msgs_"
            ):
                 validate_mixed_dataset(matches[0])
 
         for name in [
-            "knowledge_new_task.yaml",
-            "mmlubench_knowledge_new.jsonl",
+            "knowledge_new1_task.yaml",
+            "mmlubench_knowledge_new1.jsonl",
+            "knowledge_new2_task.yaml",
+            "mmlubench_knowledge_new2.jsonl",
         ]:
             matches = glob.glob(os.path.join(self.tmp_path, "node_datasets_*", name))
             assert len(matches) == 1
-            if name == "knowledge_new_task.yaml":
+            if name == "knowledge_new1_task.yaml":
                 validate_lm_eval_task(matches[0])
-            elif name == "mmlubench_knowledge_new.jsonl":
+            elif name == "mmlubench_knowledge_new1.jsonl":
                 validate_mmlubench_dataset(matches[0])
 
     def teardown(self) -> None:
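
A note on the expected filenames above: the knowledge_new1/knowledge_new2 stems come from the leaf-node taxonomy paths (knowledge/new1, knowledge/new2) with path separators flattened to underscores, matching the leaf_node_path computed in taxonomy.py; the _p07/_p10 suffixes appear to distinguish the two phased-training dataset variants the SDG pipeline emits per knowledge node.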