From ed02973a5cbd54b476d7b4f3434d7c9d3cf5ddba Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Sat, 23 Nov 2024 14:32:58 -0500 Subject: [PATCH 1/8] Replace DocumentChunker factory with docling-based chunker by default Signed-off-by: Khaled Sulayman --- src/instructlab/sdg/utils/chunkers.py | 384 ++++++++++++++------------ src/instructlab/sdg/utils/taxonomy.py | 6 +- 2 files changed, 214 insertions(+), 176 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index d8de36de..b292239c 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -3,7 +3,7 @@ from collections import defaultdict from enum import Enum from pathlib import Path -from typing import DefaultDict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple import json import logging import re @@ -26,6 +26,7 @@ logger = logging.getLogger(__name__) _DEFAULT_CHUNK_OVERLAP = 100 +SUPPORTED_FILETYPES = [".pdf", ".md"] def _num_tokens_from_words(num_words) -> int: @@ -68,185 +69,211 @@ def resolve_ocr_options() -> OcrOptions: return None -class FileTypes(Enum): - MD = ".md" - PDF = ".pdf" - - -class ChunkerBase(ABC): - @abstractmethod - def chunk_documents(self): - pass - - -class DocumentChunker: - """A factory chunker class that instantiates the applicable chunker - - Currently, only Markdown and PDF are supported. For Markdown, returns - TextSplitChunker, and for PDF, returns ContextAwareChunker""" - - def __new__( - cls, - leaf_node, - taxonomy_path, - output_dir: Path, - server_ctx_size=4096, - chunk_word_count=1024, - tokenizer_model_name: Optional[str] = None, - docling_model_path: Optional[str] = None, - ): - """Insantiate the appropriate chunker for the provided document - - Args: - leaf_node: a leaf node dict containing "documents", - "filepaths", and "taxonomy_path" keys - output_dir (Path): directory where artifacts should be stored - server_ctx_size (int): Context window size of server - chunk_word_count (int): Maximum number of words to chunk a document - tokenizer_model_name (Optional[str]): name of huggingface model to get - tokenizer from - Returns: - TextSplitChunker | ContextAwareChunker: Object of the appropriate - chunker class for the provided filetype - """ - documents = leaf_node[0]["documents"] - - if not isinstance(taxonomy_path, Path): - taxonomy_path = Path(taxonomy_path) - - if isinstance(documents, str): - documents = [documents] - logger.info( - "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." 
- ) - elif not isinstance(documents, list): - raise TypeError( - "Expected: documents to be a list, but got {}".format(type(documents)) - ) - - filepaths = leaf_node[0]["filepaths"] - - doc_dict = cls._split_docs_by_filetype(documents, filepaths) - if len(doc_dict.keys()) > 1: - raise ValueError("Received multiple document types") - if len(doc_dict.keys()) < 1: - raise ValueError("Received no document types") - - if FileTypes.MD in doc_dict: - doc_contents = [d for d, _ in doc_dict[FileTypes.MD]] - return TextSplitChunker( - doc_contents, - server_ctx_size, - chunk_word_count, - output_dir, - ) - - if FileTypes.PDF in doc_dict: - doc_paths = [p for _, p in doc_dict[FileTypes.PDF]] - return ContextAwareChunker( - doc_paths, - filepaths, - output_dir, - chunk_word_count, - tokenizer_model_name, - docling_model_path=docling_model_path, - ) - - @staticmethod - def _split_docs_by_filetype( - documents: List[str], filepaths: List[Path] - ) -> DefaultDict[FileTypes, List[Tuple[str, Path]]]: - """Separate documents into lists based on their filetype. - - Currently, only Markdown and PDF are supported. - Args: - documents (List[str]): A list of the document contents as strings - filepaths (List[Path]): Corresponding document filepaths - Returns: - DefaultDict: Dictionary with either ".md" or ".pdf" as a key. - Markdown items contain document contents, PDF items contain - paths to documents. - """ - doc_dict = defaultdict(list) - for doc, path in zip(documents, filepaths): - if path.suffix == ".md": - # append doc contents - doc_dict[FileTypes.MD].append((doc, path)) - elif path.suffix == ".pdf": - # append doc paths - doc_dict[FileTypes.PDF].append((doc, path)) - else: - raise ValueError( - f"Received document of type .{path.suffix}, which is not a supported filetype" - ) - return doc_dict - +def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: + """Split document paths into a dict of lists based on their file extension. + """ + document_dict = defaultdict(list) + for path in document_paths: + filetype = path.suffix + if filetype not in SUPPORTED_FILETYPES: + raise ValueError(f"Provided unsupported filetype {filetype}") + + document_dict[filetype].append(path) + + return dict(document_dict) + + + +# class DocumentChunker: +# """A factory chunker class that instantiates the applicable chunker + +# Currently, only Markdown and PDF are supported. 
For Markdown, returns +# TextSplitChunker, and for PDF, returns ContextAwareChunker""" + +# def __new__( +# cls, +# doc_filepaths: List[Path], +# output_dir: Path, +# server_ctx_size=4096, +# chunk_word_count=1024, +# tokenizer_model_name: Optional[str] = None, +# docling_model_path: Optional[str] = None, +# ): +# """Insantiate the appropriate chunker for the provided document + +# Args: +# leaf_node: a leaf node dict containing "documents", +# "filepaths", and "taxonomy_path" keys +# output_dir (Path): directory where artifacts should be stored +# server_ctx_size (int): Context window size of server +# chunk_word_count (int): Maximum number of words to chunk a document +# tokenizer_model_name (Optional[str]): name of huggingface model to get +# tokenizer from +# Returns: +# TextSplitChunker | ContextAwareChunker: Object of the appropriate +# chunker class for the provided filetype +# """ +# documents = leaf_node[0]["documents"] + +# if not isinstance(taxonomy_path, Path): +# taxonomy_path = Path(taxonomy_path) + +# if isinstance(documents, str): +# documents = [documents] +# logger.info( +# "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." +# ) +# elif not isinstance(documents, list): +# raise TypeError( +# "Expected: documents to be a list, but got {}".format(type(documents)) +# ) + +# filepaths = leaf_node[0]["filepaths"] + +# doc_dict = cls._split_docs_by_filetype(documents, filepaths) +# if len(doc_dict.keys()) > 1: +# raise ValueError("Received multiple document types") +# if len(doc_dict.keys()) < 1: +# raise ValueError("Received no document types") + +# if SupportedFileTypes.MD in doc_dict: +# doc_contents = [d for d, _ in doc_dict[SupportedFileTypes.MD]] +# # return TextSplitChunker( +# # doc_contents, +# # server_ctx_size, +# # chunk_word_count, +# # output_dir, +# # ) + +# # TODO CHUNK AS MARKDOWN +# pass + +# if SupportedFileTypes.PDF in doc_dict: +# doc_paths = [p for _, p in doc_dict[SupportedFileTypes.PDF]] +# # return ContextAwareChunker( +# # doc_paths, +# # filepaths, +# # output_dir, +# # chunk_word_count, +# # tokenizer_model_name, +# # docling_model_path=docling_model_path, +# # ) + +# # TODO CHUNK AS PDF +# pass + +# @staticmethod +# def _split_docs_by_filetype( +# documents: List[str], filepaths: List[Path] +# ) -> DefaultDict[SupportedFileTypes, List[Tuple[str, Path]]]: +# """Separate documents into lists based on their filetype. + +# Currently, only Markdown and PDF are supported. +# Args: +# documents (List[str]): A list of the document contents as strings +# filepaths (List[Path]): Corresponding document filepaths +# Returns: +# DefaultDict: Dictionary with either ".md" or ".pdf" as a key. +# Markdown items contain document contents, PDF items contain +# paths to documents. 
+# """ +# doc_dict = defaultdict(list) +# for doc, path in zip(documents, filepaths): +# if path.suffix == ".md": +# # append doc contents +# doc_dict[SupportedFileTypes.MD].append((doc, path)) +# elif path.suffix == ".pdf": +# # append doc paths +# doc_dict[SupportedFileTypes.PDF].append((doc, path)) +# else: +# raise ValueError( +# f"Received document of type .{path.suffix}, which is not a supported filetype" +# ) +# return doc_dict + + +# class TextSplitChunker(ChunkerBase): +# def __init__( +# self, +# document_contents: List | str, +# server_ctx_size: int, +# chunk_word_count: int, +# output_dir: Path, +# ): +# self.document_contents = document_contents +# self.server_ctx_size = server_ctx_size +# self.chunk_word_count = chunk_word_count +# self.output_dir = output_dir + +# def chunk_documents(self) -> List: +# """Naively chunk markdown documents based on the word count provided by the user. +# Returns: +# List[str]: List of chunked documents. +# """ +# num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) +# if num_tokens_per_doc > int(self.server_ctx_size - 1024): +# raise ValueError( +# "Error: {}".format( +# str( +# f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" +# ) +# ) +# ) +# if self.document_contents == []: +# return [] + +# chunk_size = _num_chars_from_tokens(num_tokens_per_doc) +# return chunk_markdowns(self.document_contents, chunk_size) + + +class DocumentChunker(): # pylint: disable=too-many-instance-attributes + + # def __new__( + # cls, + # leaf_node, + # taxonomy_path, + # output_dir: Path, + # server_ctx_size=4096, + # chunk_word_count=1024, + # tokenizer_model_name: Optional[str] = None, + # docling_model_path: Optional[str] = None, + # ): -class TextSplitChunker(ChunkerBase): def __init__( self, - document_contents: List | str, - server_ctx_size: int, - chunk_word_count: int, + document_paths: List[Path], output_dir: Path, + tokenizer_model_name: str, + docling_model_path: Optional[Path] = None, + server_ctx_size: int = 4096, + chunk_word_count: int = 1024, ): - self.document_contents = document_contents - self.server_ctx_size = server_ctx_size - self.chunk_word_count = chunk_word_count - self.output_dir = output_dir + if len(document_paths) == 0: + raise ValueError("Provided empty list of documents") - def chunk_documents(self) -> List: - """Naively chunk markdown documents based on the word count provided by the user. - Returns: - List[str]: List of chunked documents. 
- """ - num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) - if num_tokens_per_doc > int(self.server_ctx_size - 1024): - raise ValueError( - "Error: {}".format( - str( - f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" - ) - ) - ) - if self.document_contents == []: - return [] + document_dict = split_docs_by_filetype(document_paths) - chunk_size = _num_chars_from_tokens(num_tokens_per_doc) - return chunk_markdowns(self.document_contents, chunk_size) + if len(document_dict) > 1: + raise ValueError("Provided multiple document types") + # We know there is only 1 key, value pair, so we take the first + self.document_filetype, self.document_paths = next(iter(document_dict.items())) + self.docling_model_path = docling_model_path + self.converter = self._init_docling_converter() -class ContextAwareChunker(ChunkerBase): # pylint: disable=too-many-instance-attributes - def __init__( - self, - document_paths, - filepaths, - output_dir: Path, - chunk_word_count: int, - tokenizer_model_name: Optional[str], - docling_model_path=None, - ): - self.document_paths = document_paths - self.filepaths = filepaths self.output_dir = self._path_validator(output_dir) + self.server_ctx_size = server_ctx_size self.chunk_word_count = chunk_word_count self.tokenizer = self.create_tokenizer(tokenizer_model_name) - self.docling_model_path = docling_model_path - - def chunk_documents(self) -> List: - """Semantically chunk PDF documents. - Returns: - List: a list of chunks from the documents + def _init_docling_converter(self): + """Initialize docling converter with filetype-specific configurations """ # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party - from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline - - if self.document_paths == []: - return [] + from docling.document_converter import DocumentConverter, PdfFormatOption if self.docling_model_path is None: logger.info("Docling models not found on disk, downloading models...") @@ -258,17 +285,29 @@ def chunk_documents(self) -> List: artifacts_path=self.docling_model_path, do_ocr=False, ) + ocr_options = resolve_ocr_options() if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options - converter = DocumentConverter( + return DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) } ) - parsed_documents = converter.convert_all(self.filepaths) + + def chunk_documents(self) -> List: + """Split a list of documents into chunks + + Returns: + List: a list of chunks from the documents + """ + + if self.document_paths == []: + return [] + + parsed_documents = self.converter.convert_all(self.document_paths) docling_artifacts_path = self.export_documents(parsed_documents) @@ -348,7 +387,7 @@ def fuse_texts( return fused_texts @staticmethod - def create_tokenizer(model_name: Optional[str]): + def create_tokenizer(model_name: str): """ Create a tokenizer instance from a pre-trained model or a local directory. 
@@ -363,10 +402,7 @@ def create_tokenizer(model_name: Optional[str]): # Third Party from transformers import AutoTokenizer - if model_name is None: - raise TypeError("No model path provided") - - model_path = Path(model_name) + model_path = Path(model_name) # TODO expect a path from the DocumentChunker constructor error_info_message = ( "Please run `ilab model download {download_args}` and try again" ) @@ -486,7 +522,7 @@ def get_table_page_number(self, json_book, idx): prev_page_num = book_element["prov"][0]["page"] break for book_element in json_book["main-text"][idx:]: - if "prov" in book_element: + if "prov" in book_element and book_element["prov"]: next_page_num = book_element["prov"][0]["page"] break if prev_page_num is not None and next_page_num is not None: @@ -545,8 +581,12 @@ def build_chunks_from_docling_json( current_book_page_number = self.get_table_page_number( json_book, idx ) + book_text = self.get_table(json_book, book_element["$ref"]) + elif book_element["prov"]: + current_book_page_number = book_element["prov"][0]["page"] # TODO export to function to handle empty ["prov"] + book_text = book_element["text"] else: - current_book_page_number = book_element["prov"][0]["page"] + current_book_page_number = None book_text = book_element["text"] if book_element["type"] == "subtitle-level-1": @@ -599,8 +639,6 @@ def build_chunks_from_docling_json( if book_element["type"] == "paragraph": book_text = self.add_heading_formatting(book_text) - elif book_element["type"] == "table": - book_text = self.get_table(json_book, book_element["$ref"]) if "## References" in book_text or "## Acknowledgements" in book_text: # For research papers we ignore everything after this sections break diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 00743d93..897eddbe 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -418,13 +418,13 @@ def _knowledge_leaf_node_to_samples( model_name, docling_model_path=None, ): + document_paths = leaf_node[0]["filepaths"] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=taxonomy_path, + document_paths=document_paths, output_dir=document_output_dir, + tokenizer_model_name=model_name, server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, - tokenizer_model_name=model_name, docling_model_path=docling_model_path, ) chunks = chunker.chunk_documents() From a6d06d0663b338b569ba3f38ba3e11d2ac063729 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Thu, 5 Dec 2024 17:05:57 -0500 Subject: [PATCH 2/8] Modify tests to use new DocumentChunker interface Signed-off-by: Khaled Sulayman --- tests/functional/test_chunkers.py | 10 +++--- tests/test_chunkers.py | 59 +++++-------------------------- tests/test_generate_data.py | 12 +++---- 3 files changed, 18 insertions(+), 63 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 217cb889..d364ac48 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -26,12 +26,11 @@ def test_chunk_pdf(tmp_path, tokenizer_model_name): } ] chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=tmp_path, + document_paths=[pdf_path], output_dir=tmp_path, + tokenizer_model_name=tokenizer_model_name, server_ctx_size=4096, chunk_word_count=500, - tokenizer_model_name=tokenizer_model_name, ) chunks = chunker.chunk_documents() assert len(chunks) > 9 @@ -51,12 +50,11 @@ def test_chunk_md(tmp_path, tokenizer_model_name): } ] chunker = DocumentChunker( 
- leaf_node=leaf_node, - taxonomy_path=tmp_path, + document_paths=[markdown_path], output_dir=tmp_path, + tokenizer_model_name=tokenizer_model_name, server_ctx_size=4096, chunk_word_count=500, - tokenizer_model_name=tokenizer_model_name, ) chunks = chunker.chunk_documents() assert len(chunks) > 7 diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 700758ae..ff1079d5 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -12,10 +12,7 @@ # First Party from instructlab.sdg.utils.chunkers import ( - ContextAwareChunker, DocumentChunker, - FileTypes, - TextSplitChunker, resolve_ocr_options, ) @@ -35,65 +32,25 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -@pytest.mark.parametrize( - "filepaths, chunker_type", - [ - ([Path("document.md")], TextSplitChunker), - ([Path("document.pdf")], ContextAwareChunker), - ], -) -def test_chunker_factory(filepaths, chunker_type, documents_dir, tokenizer_model_name): - """Test that the DocumentChunker factory class returns the proper Chunker type""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": filepaths, - } - ] - with tempfile.TemporaryDirectory() as temp_dir: - chunker = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, - output_dir=temp_dir, - tokenizer_model_name=tokenizer_model_name, - ) - assert isinstance(chunker, chunker_type) - - -def test_chunker_factory_unsupported_filetype(documents_dir, tokenizer_model_name): +def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name): """Test that the DocumentChunker factory class fails when provided an unsupported document""" - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "taxonomy_path": "", - "filepaths": [Path("document.jpg")], - } - ] + document_paths = [documents_dir / "document.jpg"] with pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) -def test_chunker_factory_empty_filetype(documents_dir): +def test_chunker_factory_empty_document_paths(tokenizer_model_name): """Test that the DocumentChunker factory class fails when provided no document""" - leaf_node = [ - { - "documents": [], - "taxonomy_path": "", - "filepaths": [], - } - ] + document_paths = [] with pytest.raises(ValueError): with tempfile.TemporaryDirectory() as temp_dir: _ = DocumentChunker( - leaf_node=leaf_node, - taxonomy_path=documents_dir, + document_paths=document_paths, output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) @@ -149,7 +106,7 @@ def test_resolve_ocr_options_none_found_logs_error( def test_create_tokenizer(tokenizer_model_name): - ContextAwareChunker.create_tokenizer(tokenizer_model_name) + DocumentChunker.create_tokenizer(tokenizer_model_name) @pytest.mark.parametrize( @@ -163,4 +120,4 @@ def test_create_tokenizer(tokenizer_model_name): def test_invalid_tokenizer(model_name): model_path = os.path.join(TEST_DATA_DIR, model_name) with pytest.raises(ValueError): - ContextAwareChunker.create_tokenizer(model_path) + DocumentChunker.create_tokenizer(model_path) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index 0d04a80f..4c296d62 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -313,8 +313,8 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - 
model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -390,8 +390,8 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -488,8 +488,8 @@ def test_generate(self): generate_data( client=MagicMock(), logger=mocked_logger, - model_family="merlinite", - model_name="models/merlinite-7b-lab-Q4_K_M.gguf", + model_family="granite", + model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, From 9457380a217f8f997898bc8076864e2cacc8030d Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Tue, 17 Dec 2024 16:48:47 +0400 Subject: [PATCH 3/8] Check for html in markdown files and error out Signed-off-by: Khaled Sulayman --- src/instructlab/sdg/utils/taxonomy.py | 18 ++++++++++++++++++ tests/test_taxonomy.py | 12 ++++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 897eddbe..5ac466e6 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -109,6 +109,18 @@ def _get_taxonomy(repo="taxonomy"): return taxonomy_file_paths +def _string_contains_html(s: str) -> bool: + """Detect HTML tags in a string. + + We use this to catch markdown files that may contain html elements since + docling does not support this.""" + # Define a regex to detect HTML tags + html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>") + + # Check for HTML tags in the content + return bool(html_tag_pattern.search(s)) + + def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, @@ -160,6 +172,12 @@ def _get_documents( # Process Markdown files with open(file_path, "r", encoding="utf-8") as file: content = file.read() + if _string_contains_html(content): + raise ValueError(f"Provided markdown file {file_path} contains" + " HTML, which is currently unsupported. Please" + " format your markdown documents without the" + " use of HTML or use a different document" + " filetype.") file_contents.append(content) filepaths.append(Path(file_path)) logger.info( diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index 0828e187..32114fcc 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -11,6 +11,7 @@ # First Party from instructlab.sdg.utils import taxonomy +from instructlab.sdg.utils.taxonomy import _string_contains_html TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?" @@ -85,3 +86,14 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True + + @pytest.mark.parametrize( + "s, contains_html", + [ + ("hello, world!", False), + ("hello,
<div>world!</div>
", True), + ] + ) + def test_string_contains_html(self, s, contains_html): + print(taxonomy.__dict__) + assert _string_contains_html(s) == contains_html From 172216101619e7bc4b3b7776af58bae5afbe37dd Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Wed, 18 Dec 2024 16:42:46 +0400 Subject: [PATCH 4/8] replace test_valid_knowledge_skill.yaml with example with no html Signed-off-by: Khaled Sulayman --- .../testdata/test_valid_knowledge_skill.yaml | 322 ++++++++++-------- 1 file changed, 172 insertions(+), 150 deletions(-) diff --git a/tests/testdata/test_valid_knowledge_skill.yaml b/tests/testdata/test_valid_knowledge_skill.yaml index 705acb41..be4c3ef9 100644 --- a/tests/testdata/test_valid_knowledge_skill.yaml +++ b/tests/testdata/test_valid_knowledge_skill.yaml @@ -1,176 +1,198 @@ -created_by: lukeinglis -domain: anatomy_tonsil version: 3 +domain: astronomy +created_by: juliadenham seed_examples: - context: | - ## Structure - Humans are born with four types of tonsils: the pharyngeal tonsil, two - tubal tonsils, two palatine tonsils, and the lingual tonsils.[1] - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + **Phoenix** is a minor [constellation](constellation "wikilink") in the + [southern sky](southern_sky "wikilink"). Named after the mythical + [phoenix](Phoenix_(mythology) "wikilink"), it was first depicted on a + celestial atlas by [Johann Bayer](Johann_Bayer "wikilink") in his 1603 + *[Uranometria](Uranometria "wikilink")*. The French explorer and + astronomer [Nicolas Louis de + Lacaille](Nicolas_Louis_de_Lacaille "wikilink") charted the brighter + stars and gave their [Bayer designations](Bayer_designation "wikilink") + in 1756. The constellation stretches from roughly −39 degrees to −57 degrees + [declination](declination "wikilink"), and from 23.5h to 2.5h of [right + ascension](right_ascension "wikilink"). The constellations Phoenix, + [Grus](Grus_(constellation) "wikilink"), + [Pavo](Pavo_(constellation) "wikilink") and [Tucana](Tucana "wikilink"), + are known as the Southern Birds. questions_and_answers: - - question: What is the location of the tubal tonsils? - answer: The location of the tubal tonsils is the roof of the pharynx. - question: | - Compare the epithelial types, encapsulation, and presence of - crypts in the pharyngeal, tubal, and palatine tonsils according to the - table provided. + What is the Phoenix constellation? answer: | - The pharyngeal tonsil features ciliated pseudostratified columnar - epithelium and is incompletely encapsulated with small folds sometimes - described as crypts. The tubal tonsils also have ciliated - pseudostratified columnar epithelium but are not encapsulated and do - not possess crypts. In contrast, the palatine tonsils are covered with - stratified squamous epithelium, are fully encapsulated, and contain - multiple deep crypts. These structural differences are indicative of - their varied anatomical locations and potentially their distinct - functions within the immune system. - - question: What type of epithelium is found in the pharyngeal tonsil? + Phoenix is a minor constellation in the southern sky. + - question: | + Who charted the Phoenix constellation? + answer: | + The Phoenix constellation was charted by french explorer and + astronomer Nicolas Louis de Lacaille. + - question: | + How far does the Phoenix constellation stretch? answer: | - The type of epithelium found in the pharyngeal tonsil is ciliated - pseudostratified columnar (respiratory epithelium). 
- - + The phoenix constellation stretches from roughly −39° to −57° + declination, and from 23.5h to 2.5h of right ascension. - context: | - The **tonsils** are a set of [lymphoid](Lymphatic_system "wikilink") - organs facing into the aerodigestive tract, which is known as - [Waldeyer's tonsillar ring](Waldeyer's_tonsillar_ring "wikilink") and - consists of the [adenoid tonsil](adenoid "wikilink") (or pharyngeal - tonsil), two [tubal tonsils](tubal_tonsil "wikilink"), two [palatine - tonsils](palatine_tonsil "wikilink"), and the [lingual - tonsils](lingual_tonsil "wikilink"). These organs play an important role - in the immune system. - + Phoenix was the largest of the 12 constellations established by [Petrus + Plancius](Petrus_Plancius "wikilink") from the observations of [Pieter + Dirkszoon Keyser](Pieter_Dirkszoon_Keyser "wikilink") and [Frederick de + Houtman](Frederick_de_Houtman "wikilink"). It first appeared on a 35cm + diameter celestial globe published in 1597 (or 1598) in Amsterdam by + Plancius with [Jodocus Hondius](Jodocus_Hondius "wikilink"). The first + depiction of this constellation in a celestial atlas was in [Johann + Bayer](Johann_Bayer "wikilink")'s + *[Uranometria](Uranometria "wikilink")* of 1603. De Houtman included + it in his southern star catalog the same year under the Dutch name *Den + voghel Fenicx*, "The Bird Phoenix", symbolising the + [phoenix](Phoenix_(mythology) "wikilink") of classical mythology. One + name of the brightest star [Alpha + Phoenicis](Alpha_Phoenicis "wikilink")—Ankaa—is derived from the Arabic: + العنقاء, romanized: al-‘anqā’, lit. 'the phoenix', and + was coined sometime after 1800 in relation to the constellation. questions_and_answers: - - question: What is the immune system's first line of defense? + - question: | + What is the brightest star in the Phoenix constellation + called? answer: | - The tonsils are the immune system's first line of defense against - ingested or inhaled foreign pathogens. - - question: What is Waldeyer's tonsillar ring? + Alpha Phoenicis or Ankaa is the brightest star in the Phoenix + Constellation. + - question: Where did the Phoenix constellation first appear? answer: | - Waldeyer's tonsillar ring is a set of lymphoid organs facing into the - aerodigestive tract, consisting of the adenoid tonsil, two tubal - tonsils, two palatine tonsils, and the lingual tonsils. - - question: How many tubal tonsils are part of Waldeyer's tonsillar ring? - answer: There are two tubal tonsils as part of Waldeyer's tonsillar ring. - + The Phoenix constellation first appeared on a 35-cm diameter + celestial globe published in 1597 (or 1598) in Amsterdam by + Plancius with Jodocus Hondius. + - question: | + What does "The Bird Phoenix" symbolize? + answer: | + "The Bird Phoenix" symbolizes the phoenix of classical mythology. - context: | - The palatine tonsils tend to reach their largest size in [puberty](puberty - "wikilink"), and they gradually undergo [atrophy](atrophy "wikilink") - thereafter. However, they are largest relative to the diameter of the - throat in young children. In adults, each palatine tonsil normally - measures up to 2.5 cm in length, 2.0 cm in width and 1.2 cm in - thickness.[2] - + Phoenix is a small constellation bordered by [Fornax](Fornax "wikilink") + and Sculptor to the north, Grus to the west, Tucana to the south, + touching on the corner of [Hydrus](Hydrus "wikilink") to the south, and + [Eridanus](Eridanus_(constellation) "wikilink") to the east and + southeast. 
The bright star [Achernar](Achernar "wikilink") is + nearby. The three-letter abbreviation for the constellation, as + adopted by the [International Astronomical + Union](International_Astronomical_Union "wikilink") in 1922, is + "Phe". The official constellation boundaries, as set by Belgian + astronomer [Eugène Delporte](Eugène_Joseph_Delporte "wikilink") in 1930, + are defined by a polygon of 10 segments. In the [equatorial coordinate + system](equatorial_coordinate_system "wikilink"), the [right + ascension](right_ascension "wikilink") coordinates of these borders lie + between 23h 26.5m and 02h 25.0m, + while the [declination](declination "wikilink") + coordinates are between −39.31° and −57.84°. This means it remains + below the horizon to anyone living north of the [40th + parallel](40th_parallel_north "wikilink") in the [Northern + Hemisphere](Northern_Hemisphere "wikilink"), and remains low in the sky + for anyone living north of the [equator](equator "wikilink"). It is most + visible from locations such as Australia and South Africa during late + [Southern Hemisphere](Southern_Hemisphere "wikilink") spring. Most + of the constellation lies within, and can be located by, forming a + triangle of the bright stars Achernar, [Fomalhaut](Fomalhaut "wikilink") + and [Beta Ceti](Beta_Ceti "wikilink")—Ankaa lies roughly in the centre + of this. questions_and_answers: - - question: When do the palatine tonsils tend to reach their largest size? - answer: The palatine tonsils tend to reach their largest size in puberty. - - question: What are the typical dimensions of an adult palatine tonsil? + - question: What are the characteristics of the Phoenix constellation? + answer: | + Phoenix is a small constellation bordered by Fornax and Sculptor to + the north, Grus to the west, Tucana to the south, touching on the + corner of Hydrus to the south, and Eridanus to the east and southeast. + The bright star Achernar is nearby. + - question: | + When is the phoenix constellation most visible? answer: | - In adults, each palatine tonsil normally measures up to 2.5 cm in - length, 2.0 cm in width, and 1.2 cm in thickness. - - question: How do the palatine tonsils change in size with age? + Phoenix is most visible from locations such as Australia and + South Africa during late Southern Hemisphere spring. + - question: | + What are the Phoenix Constellation boundaries? answer: | - The palatine tonsils tend to gradually undergo atrophy after puberty, - becoming smaller in size compared to their dimensions in young - children. - + The official constellation boundaries for Phoenix, as set by Belgian + astronomer Eugène Delporte in 1930, are defined by a polygon of 10 + segments. - context: | - The tonsils are immunocompetent organs that serve as the immune system's - first line of defense against ingested or inhaled foreign pathogens, and - as such frequently engorge with blood to assist in immune responses to - common illnesses such as the common cold. The tonsils have on their - surface specialized antigen capture cells called [microfold - cells](microfold_cell "wikilink") (M cells) that allow for the uptake of - antigens produced by pathogens. These M cells then alert the B cells and T - cells in the tonsil that a pathogen is present and an immune response is - stimulated.[3] B cells are activated and proliferate in areas called - germinal centers in the tonsil. 
These germinal centers are places where B - memory cells are created and [secretory antibody (IgA)](Immunoglobulin_A - "wikilink") is produced. - + Ten stars have been found to have planets to date, and four planetary + systems have been discovered with the [SuperWASP](SuperWASP "wikilink") + project. [HD 142](HD_142 "wikilink") is a yellow giant that has an + apparent magnitude of 5.7, and has a planet ([HD 142b](HD_142_b + "wikilink")) 1.36 times the mass of Jupiter which orbits every 328 days. + [HD 2039](HD_2039 "wikilink") is a yellow subgiant with an apparent + magnitude of 9.0 around 330 light years away which has a planet ([HD 2039 + b](HD_2039_b "wikilink")) six times the mass of Jupiter. [WASP-18](WASP-18 + "wikilink") is a star of magnitude 9.29 which was discovered to have a hot + Jupiter-like planet ([WASP-18b](WASP-18b "wikilink")) taking less than a + day to orbit the star. The planet is suspected to be causing WASP-18 to + appear older than it really is. [WASP-4](WASP-4 "wikilink") and + [WASP-5](WASP-5 "wikilink") are solar-type yellow stars around 1000 + light years distant and of 13th magnitude, each with a single planet + larger than Jupiter. [WASP-29](WASP-29 "wikilink") is an orange + dwarf of spectral type K4V and visual magnitude 11.3, which has a + planetary companion of similar size and mass to Saturn. The planet + completes an orbit every 3.9 days. questions_and_answers: - - question: | - What are the specialized antigen capture cells on the surface of the - tonsils called? + - question: In the Phoenix constellation, how many stars have planets? answer: | - The specialized antigen capture cells on the surface of the tonsils - are called microfold cells (M cells). - - question: What is the role of microfold cells in the tonsils? + In the Phoenix constellation, ten stars have been found to have + planets to date, and four planetary systems have been discovered + with the SuperWASP project. + - question: What is HD 142? + answer: | + HD 142 is a yellow giant that has an apparent magnitude of 5.7, and + has a planet (HD 142 b) 1.36 times the mass of Jupiter which + orbits every 328 days. + - question: | + Are WASP-4 and WASP-5 solar-type yellow stars? answer: | - Microfold cells (M cells) allow for the uptake of antigens produced by - pathogens. They alert the B cells and T cells in the tonsil that a - pathogen is present, stimulating an immune response. - - question: Where do B cells proliferate in the tonsils? - answer: B cells proliferate in areas called germinal centers in the tonsils. - + Yes, WASP-4 and WASP-5 are solar-type yellow stars around 1000 light + years distant and of 13th magnitude, each with a single planet + larger than Jupiter. - context: | - A [tonsillolith](tonsillolith "wikilink") (also known as a "tonsil stone") - is material that accumulates on the palatine tonsil. This can reach the - size of a [peppercorn](peppercorn "wikilink") and is white or cream in - color. The main substance is mostly [calcium](calcium "wikilink"), but it - has a strong unpleasant odor because of [hydrogen - sulfide](hydrogen_sulfide "wikilink") and [methyl - mercaptan](methyl_mercaptan "wikilink") and other chemicals.[6] - + The constellation does not lie on the + [galactic plane](galactic_plane "wikilink") of the Milky Way, and there + are no prominent star clusters. [NGC 625](NGC_625 "wikilink") is a dwarf + [irregular galaxy](irregular_galaxy "wikilink") of apparent magnitude 11.0 + and lying some 12.7 million light years distant. 
Only 24000 light years in + diameter, it is an outlying member of the [Sculptor Group](Sculptor_Group + "wikilink"). NGC 625 is thought to have been involved in a collision and + is experiencing a burst of [active star formation](Active_galactic_nucleus + "wikilink"). [NGC 37](NGC_37 "wikilink") is a + [lenticular galaxy](lenticular_galaxy "wikilink") of apparent magnitude + 14.66. It is approximately 42 [kiloparsecs](kiloparsecs "wikilink") + (137,000 [light-years](light-years "wikilink")) in diameter and about + 12.9 billion years old. [Robert's Quartet](Robert's_Quartet "wikilink") + (composed of the irregular galaxy [NGC 87](NGC_87 "wikilink"), and three + spiral galaxies [NGC 88](NGC_88 "wikilink"), [NGC 89](NGC_89 "wikilink") + and [NGC 92](NGC_92 "wikilink")) is a group of four galaxies located + around 160 million light-years away which are in the process of colliding + and merging. They are within a circle of radius of 1.6 arcmin, + corresponding to about 75,000 light-years. Located in the galaxy ESO + 243-49 is [HLX-1](HLX-1 "wikilink"), an + [intermediate-mass black hole](intermediate-mass_black_hole + "wikilink")—the first one of its kind identified. It is thought to be a + remnant of a dwarf galaxy that was absorbed in a + [collision](Interacting_galaxy "wikilink") with ESO 243-49. Before its + discovery, this class of black hole was only hypothesized. questions_and_answers: - - question: What is a tonsillolith? + - question: | + Is the Phoenix Constellation part of the Milky Way? + answer: | + The Phoenix constellation does not lie on the galactic plane of + the Milky Way, and there are no prominent star clusters. + - question: | + How many light years away is NGC 625? answer: | - A tonsillolith (tonsil stone) is material that accumulates on the - palatine tonsil, reaching the size of a peppercorn and having a white - or cream color. It contains calcium and has a strong unpleasant odor - due to hydrogen sulfide, methyl mercaptan, and other chemicals. - - question: What is the main substance found in a tonsillolith? - answer: The main substance found in a tonsillolith is mostly calcium. - - question: Why do tonsilloliths have a strong unpleasant odor? + NGC 625 is 24000 light years in diameter and is an outlying + member of the Sculptor Group. + - question: | + What is Robert's Quartet composed of? answer: | - Tonsilloliths have a strong unpleasant odor due to hydrogen sulfide, - methyl mercaptan, and other chemicals. - + Robert's Quartet is composed of the irregular galaxy NGC 87, + and three spiral galaxies NGC 88, NGC 89 and NGC 92. document_outline: | - Overview of Human tonsils, describing their types, locations, structure, - function, and clinical significance, with a specific focus on their role in - the immune system and related health issues. - + Information about the Phoenix Constellation including the + history, characteristics, and features of the stars in the constellation. 
document: - repo: https://github.com/luke-inglis/il-anatomy-knowledge - commit: cc7c6ca + repo: https://github.com/RedHatOfficial/rhelai-taxonomy-data + commit: c87a82eb15567f28c0a8d30025e0cd77a2150646 patterns: - - anatomy1.md + - phoenix.md From 309fd11b3ec7c36d929c8d98ff7382ce74caf2d7 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Mon, 6 Jan 2025 13:28:22 -0500 Subject: [PATCH 5/8] Update for ruff and pylint issues Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 26 ++++++++++++-------------- src/instructlab/sdg/utils/taxonomy.py | 18 ++++++++++-------- tests/test_chunkers.py | 9 ++++----- tests/test_generate_data.py | 12 +++++++++--- tests/test_taxonomy.py | 4 ++-- 5 files changed, 37 insertions(+), 32 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index b292239c..257c9cea 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -1,9 +1,7 @@ # Standard -from abc import ABC, abstractmethod from collections import defaultdict -from enum import Enum from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional import json import logging import re @@ -70,8 +68,7 @@ def resolve_ocr_options() -> OcrOptions: def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: - """Split document paths into a dict of lists based on their file extension. - """ + """Split document paths into a dict of lists based on their file extension.""" document_dict = defaultdict(list) for path in document_paths: filetype = path.suffix @@ -83,7 +80,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: return dict(document_dict) - # class DocumentChunker: # """A factory chunker class that instantiates the applicable chunker @@ -226,8 +222,7 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: # return chunk_markdowns(self.document_contents, chunk_size) -class DocumentChunker(): # pylint: disable=too-many-instance-attributes - +class DocumentChunker: # pylint: disable=too-many-instance-attributes # def __new__( # cls, # leaf_node, @@ -267,13 +262,12 @@ def __init__( self.tokenizer = self.create_tokenizer(tokenizer_model_name) def _init_docling_converter(self): - """Initialize docling converter with filetype-specific configurations - """ + """Initialize docling converter with filetype-specific configurations""" # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party - from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from docling.document_converter import DocumentConverter, PdfFormatOption + from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline if self.docling_model_path is None: logger.info("Docling models not found on disk, downloading models...") @@ -285,7 +279,7 @@ def _init_docling_converter(self): artifacts_path=self.docling_model_path, do_ocr=False, ) - + ocr_options = resolve_ocr_options() if ocr_options is not None: pipeline_options.do_ocr = True @@ -402,7 +396,9 @@ def create_tokenizer(model_name: str): # Third Party from transformers import AutoTokenizer - model_path = Path(model_name) # TODO expect a path from the DocumentChunker constructor + model_path = Path( + model_name + ) # TODO expect a path from the DocumentChunker constructor error_info_message = ( "Please run `ilab model download {download_args}` and try again" ) @@ -583,7 +579,9 @@ def 
build_chunks_from_docling_json( ) book_text = self.get_table(json_book, book_element["$ref"]) elif book_element["prov"]: - current_book_page_number = book_element["prov"][0]["page"] # TODO export to function to handle empty ["prov"] + current_book_page_number = book_element["prov"][0][ + "page" + ] # TODO export to function to handle empty ["prov"] book_text = book_element["text"] else: current_book_page_number = None diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 5ac466e6..d582c5f7 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -111,12 +111,12 @@ def _get_taxonomy(repo="taxonomy"): def _string_contains_html(s: str) -> bool: """Detect HTML tags in a string. - + We use this to catch markdown files that may contain html elements since docling does not support this.""" # Define a regex to detect HTML tags html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>") - + # Check for HTML tags in the content return bool(html_tag_pattern.search(s)) @@ -173,11 +173,13 @@ def _get_documents( with open(file_path, "r", encoding="utf-8") as file: content = file.read() if _string_contains_html(content): - raise ValueError(f"Provided markdown file {file_path} contains" - " HTML, which is currently unsupported. Please" - " format your markdown documents without the" - " use of HTML or use a different document" - " filetype.") + raise ValueError( + f"Provided markdown file {file_path} contains" + " HTML, which is currently unsupported. Please" + " format your markdown documents without the" + " use of HTML or use a different document" + " filetype." + ) file_contents.append(content) filepaths.append(Path(file_path)) logger.info( @@ -429,7 +431,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset: def _knowledge_leaf_node_to_samples( leaf_node, - taxonomy_path, + taxonomy_path, # pylint: disable=unused-argument server_ctx_size, chunk_word_count, document_output_dir, diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index ff1079d5..42db1ac3 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -11,10 +11,7 @@ import pytest # First Party -from instructlab.sdg.utils.chunkers import ( - DocumentChunker, - resolve_ocr_options, -) +from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options # Local from .testdata import testdata @@ -32,7 +29,9 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name): +def test_init_document_chunker_unsupported_filetype( + documents_dir, tokenizer_model_name +): """Test that the DocumentChunker factory class fails when provided an unsupported document""" document_paths = [documents_dir / "document.jpg"] with pytest.raises(ValueError): diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index 4c296d62..9b8f2518 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -314,7 +314,9 @@ def test_generate(self): client=MagicMock(), logger=mocked_logger, model_family="granite", - model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -391,7 +393,9 @@ def test_generate(self): client=MagicMock(), logger=mocked_logger, model_family="granite", - 
model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, @@ -489,7 +493,9 @@ def test_generate(self): client=MagicMock(), logger=mocked_logger, model_family="granite", - model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"), + model_name=os.path.join( + TEST_DATA_DIR, "models/instructlab/granite-7b-lab" + ), num_instructions_to_generate=10, taxonomy=self.test_taxonomy.root, taxonomy_base=TEST_TAXONOMY_BASE, diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py index 32114fcc..0b697bd7 100644 --- a/tests/test_taxonomy.py +++ b/tests/test_taxonomy.py @@ -86,13 +86,13 @@ def test_read_taxonomy_leaf_nodes( ): seed_example_exists = True assert seed_example_exists is True - + @pytest.mark.parametrize( "s, contains_html", [ ("hello, world!", False), ("hello,
<div>world!</div>
", True), - ] + ], ) def test_string_contains_html(self, s, contains_html): print(taxonomy.__dict__) From 0b9a2fdbd21df6bb1f0865f631e11577f512d71d Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 7 Jan 2025 15:12:54 -0500 Subject: [PATCH 6/8] Update chunkers.py with minor cleanup Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 144 +------------------------- 1 file changed, 1 insertion(+), 143 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 257c9cea..5fe1f5d5 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -80,148 +80,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]: return dict(document_dict) -# class DocumentChunker: -# """A factory chunker class that instantiates the applicable chunker - -# Currently, only Markdown and PDF are supported. For Markdown, returns -# TextSplitChunker, and for PDF, returns ContextAwareChunker""" - -# def __new__( -# cls, -# doc_filepaths: List[Path], -# output_dir: Path, -# server_ctx_size=4096, -# chunk_word_count=1024, -# tokenizer_model_name: Optional[str] = None, -# docling_model_path: Optional[str] = None, -# ): -# """Insantiate the appropriate chunker for the provided document - -# Args: -# leaf_node: a leaf node dict containing "documents", -# "filepaths", and "taxonomy_path" keys -# output_dir (Path): directory where artifacts should be stored -# server_ctx_size (int): Context window size of server -# chunk_word_count (int): Maximum number of words to chunk a document -# tokenizer_model_name (Optional[str]): name of huggingface model to get -# tokenizer from -# Returns: -# TextSplitChunker | ContextAwareChunker: Object of the appropriate -# chunker class for the provided filetype -# """ -# documents = leaf_node[0]["documents"] - -# if not isinstance(taxonomy_path, Path): -# taxonomy_path = Path(taxonomy_path) - -# if isinstance(documents, str): -# documents = [documents] -# logger.info( -# "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input." -# ) -# elif not isinstance(documents, list): -# raise TypeError( -# "Expected: documents to be a list, but got {}".format(type(documents)) -# ) - -# filepaths = leaf_node[0]["filepaths"] - -# doc_dict = cls._split_docs_by_filetype(documents, filepaths) -# if len(doc_dict.keys()) > 1: -# raise ValueError("Received multiple document types") -# if len(doc_dict.keys()) < 1: -# raise ValueError("Received no document types") - -# if SupportedFileTypes.MD in doc_dict: -# doc_contents = [d for d, _ in doc_dict[SupportedFileTypes.MD]] -# # return TextSplitChunker( -# # doc_contents, -# # server_ctx_size, -# # chunk_word_count, -# # output_dir, -# # ) - -# # TODO CHUNK AS MARKDOWN -# pass - -# if SupportedFileTypes.PDF in doc_dict: -# doc_paths = [p for _, p in doc_dict[SupportedFileTypes.PDF]] -# # return ContextAwareChunker( -# # doc_paths, -# # filepaths, -# # output_dir, -# # chunk_word_count, -# # tokenizer_model_name, -# # docling_model_path=docling_model_path, -# # ) - -# # TODO CHUNK AS PDF -# pass - -# @staticmethod -# def _split_docs_by_filetype( -# documents: List[str], filepaths: List[Path] -# ) -> DefaultDict[SupportedFileTypes, List[Tuple[str, Path]]]: -# """Separate documents into lists based on their filetype. - -# Currently, only Markdown and PDF are supported. 
-# Args: -# documents (List[str]): A list of the document contents as strings -# filepaths (List[Path]): Corresponding document filepaths -# Returns: -# DefaultDict: Dictionary with either ".md" or ".pdf" as a key. -# Markdown items contain document contents, PDF items contain -# paths to documents. -# """ -# doc_dict = defaultdict(list) -# for doc, path in zip(documents, filepaths): -# if path.suffix == ".md": -# # append doc contents -# doc_dict[SupportedFileTypes.MD].append((doc, path)) -# elif path.suffix == ".pdf": -# # append doc paths -# doc_dict[SupportedFileTypes.PDF].append((doc, path)) -# else: -# raise ValueError( -# f"Received document of type .{path.suffix}, which is not a supported filetype" -# ) -# return doc_dict - - -# class TextSplitChunker(ChunkerBase): -# def __init__( -# self, -# document_contents: List | str, -# server_ctx_size: int, -# chunk_word_count: int, -# output_dir: Path, -# ): -# self.document_contents = document_contents -# self.server_ctx_size = server_ctx_size -# self.chunk_word_count = chunk_word_count -# self.output_dir = output_dir - -# def chunk_documents(self) -> List: -# """Naively chunk markdown documents based on the word count provided by the user. -# Returns: -# List[str]: List of chunked documents. -# """ -# num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) -# if num_tokens_per_doc > int(self.server_ctx_size - 1024): -# raise ValueError( -# "Error: {}".format( -# str( -# f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})" -# ) -# ) -# ) -# if self.document_contents == []: -# return [] - -# chunk_size = _num_chars_from_tokens(num_tokens_per_doc) -# return chunk_markdowns(self.document_contents, chunk_size) - - class DocumentChunker: # pylint: disable=too-many-instance-attributes # def __new__( # cls, @@ -243,7 +101,7 @@ def __init__( server_ctx_size: int = 4096, chunk_word_count: int = 1024, ): - if len(document_paths) == 0: + if not document_paths: raise ValueError("Provided empty list of documents") document_dict = split_docs_by_filetype(document_paths) From d4cc4587319af5b10e7485c02ce70a8e3ab35395 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 7 Jan 2025 16:39:58 -0500 Subject: [PATCH 7/8] Update test_chunkers to be more modular Signed-off-by: Aakanksha Duggal --- tests/functional/test_chunkers.py | 69 +++++++++++++++---------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index d364ac48..f06db504 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -8,56 +8,53 @@ # First Party from instructlab.sdg.utils.chunkers import DocumentChunker +# Constants for Test Directory and Test Documents TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata") +TEST_DOCUMENTS = { + "pdf": "sample_documents/phoenix.pdf", + "md": "sample_documents/phoenix.md", +} + + +@pytest.fixture(scope="module") +def test_paths(): + """Fixture to return paths to test documents.""" + return { + doc_type: Path(os.path.join(TEST_DATA_DIR, path)) + for doc_type, path in TEST_DOCUMENTS.items() + } @pytest.fixture def tokenizer_model_name(): + """Fixture to return the path to the tokenizer model.""" return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -def test_chunk_pdf(tmp_path, tokenizer_model_name): - pdf_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.pdf")) - leaf_node = [ - { - "documents": 
["Lorem ipsum"], - "filepaths": [pdf_path], - "taxonomy_path": "knowledge", - } - ] - chunker = DocumentChunker( - document_paths=[pdf_path], - output_dir=tmp_path, - tokenizer_model_name=tokenizer_model_name, - server_ctx_size=4096, - chunk_word_count=500, - ) - chunks = chunker.chunk_documents() - assert len(chunks) > 9 - assert "Phoenix is a minor constellation" in chunks[0] - for chunk in chunks: - # inexact sanity-checking of chunk max length - assert len(chunk) < 2500 - - -def test_chunk_md(tmp_path, tokenizer_model_name): - markdown_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.md")) - leaf_node = [ - { - "documents": [markdown_path.read_text(encoding="utf-8")], - "filepaths": [markdown_path], - "taxonomy_path": "knowledge", - } - ] +@pytest.mark.parametrize( + "doc_type, expected_chunks, contains_text", + [ + ("pdf", 9, "Phoenix is a minor constellation"), + ("md", 7, None), # Assuming there's no specific text to check in Markdown + ], +) +def test_chunk_documents( + tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text +): + """ + Generalized test function for chunking documents. + """ + document_path = test_paths[doc_type] chunker = DocumentChunker( - document_paths=[markdown_path], + document_paths=[document_path], output_dir=tmp_path, tokenizer_model_name=tokenizer_model_name, server_ctx_size=4096, chunk_word_count=500, ) chunks = chunker.chunk_documents() - assert len(chunks) > 7 + assert len(chunks) > expected_chunks + if contains_text: + assert contains_text in chunks[0] for chunk in chunks: - # inexact sanity-checking of chunk max length assert len(chunk) < 2500 From bab135c9b62d823b144464ec8dbecdcb1024038f Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 8 Jan 2025 10:38:11 -0500 Subject: [PATCH 8/8] Update HTML error to warning to avoid exiting Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/taxonomy.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index d582c5f7..c0d4dc1f 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -173,12 +173,10 @@ def _get_documents( with open(file_path, "r", encoding="utf-8") as file: content = file.read() if _string_contains_html(content): - raise ValueError( - f"Provided markdown file {file_path} contains" - " HTML, which is currently unsupported. Please" - " format your markdown documents without the" - " use of HTML or use a different document" - " filetype." + logging.warning( + f"Provided markdown file {file_path} contains HTML contents, which is currently unsupported as a part of markdown" + "NOTE: Continuing this might affect your data generation quality." + "To get best results please format your markdown documents without the use of HTML or use a different document filetype." ) file_contents.append(content) filepaths.append(Path(file_path))
