diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 4bd76c01..e4ec818c 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -10,6 +10,7 @@ import logging import os import time +import yaml # Third Party # instructlab - All of these need to go away (other than sdg) - issue #6 @@ -220,19 +221,22 @@ def _sdg_init(ctx, pipeline): data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - sdg_models_path = docling_models_path = None + sdg_models_path = docling_models_path = None for d in data_dirs: if os.path.exists(os.path.join(d, "models")): - sdg_models_path = os.path.join(d, "models") - break - - if sdg_models_path is not None: - try: - with open(os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8") as file: - config = yaml.safe_load(file) - docling_models_path = config['models'][0]['path'] - except (FileNotFoundError, NotADirectoryError, PermissionsError) as e: - log.warning(f"unable to read docling models path from config.yaml") + sdg_models_path = os.path.join(d, "models") + break + + if sdg_models_path is not None: + try: + with open( + os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" + ) as file: + config = yaml.safe_load(file) + docling_models_path = config["models"][0]["path"] + except (FileNotFoundError, NotADirectoryError, PermissionsError) as e: + logger.warning(f"unable to read docling models path from config.yaml") + for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 6dd1ccf6..ac6f0a0a 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -6,7 +6,6 @@ from typing import DefaultDict, Iterable, List, Tuple import json import logging -import os import re # Third Party @@ -217,9 +216,7 @@ def chunk_documents(self) -> List: return [] if self.docling_model_path is None: - logger.info( - f"Docling models not found on disk, downloading models..." - ) + logger.info("Docling models not found on disk, downloading models...") self.docling_model_path = StandardPdfPipeline.download_models_hf() else: logger.info("Found the docling models")