refactor: refactor LlamaIndex extensions

Signed-off-by: Panos Vagenas <[email protected]>
DS4SD · Sep 4, 2024 · 412c1be · 412c1be
1 parent 6acb898
commit 412c1be
Show file tree

Hide file tree

Showing 18 changed files with 853 additions and 658 deletions.
diff --git a/README.md b/README.md
@@ -59,25 +59,26 @@ import os
 from llama_index.core import VectorStoreIndex
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
-from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser
-from quackling.llama_index.readers.docling_reader import DoclingReader
+from quackling.llama_index.node_parsers import HierarchicalJSONNodeParser
+from quackling.llama_index.readers import DoclingPDFReader
 
-DOCS = ["https://arxiv.org/pdf/2311.18481"]
-QUERY = "What is DocQA?"
+DOCS = ["https://arxiv.org/pdf/2206.01062"]
+QUESTION = "How many pages were human annotated?"
 EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 LLM = HuggingFaceInferenceAPI(
     token=os.getenv("HF_TOKEN"),
     model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
 )
 
 index = VectorStoreIndex.from_documents(
-    documents=DoclingReader(parse_type=DoclingReader.ParseType.JSON).load_data(DOCS),
+    documents=DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON).load_data(DOCS),
     embed_model=EMBED_MODEL,
-    transformations=[HierarchicalNodeParser()],
+    transformations=[HierarchicalJSONNodeParser()],
 )
 query_engine = index.as_query_engine(llm=LLM)
-response = query_engine.query(QUERY)
-# > DocQA is a question-answering conversational assistant [...]
+result = query_engine.query(QUESTION)
+print(result.response)
+# > 80K pages were human annotated
 ```
 
 ### Chunking
@@ -88,7 +89,7 @@ to Docling document's nodes:
 
 ```python
 from docling.document_converter import DocumentConverter
-from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
+from quackling.core.chunkers import HierarchicalChunker
 
 doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2408.09869").output
 chunks = list(HierarchicalChunker().chunk(doc))
@@ -120,13 +121,15 @@ Please read [Contributing to Quackling](./CONTRIBUTING.md) for details.
 If you use Quackling in your projects, please consider citing the following:
 
 ```bib
-@software{Docling,
-author = {Deep Search Team},
-month = {7},
-title = {{Docling}},
-url = {https://github.com/DS4SD/docling},
-version = {main},
-year = {2024}
+@techreport{Docling,
+  author = "Deep Search Team",
+  month = 8,
+  title = "Docling Technical Report",
+  url = "https://arxiv.org/abs/2408.09869",
+  eprint = "2408.09869",
+  doi = "10.48550/arXiv.2408.09869",
+  version = "1.0.0",
+  year = 2024
 }
 ```
 

diff --git a/examples/basic_pipeline.ipynb b/examples/basic_pipeline.ipynb
diff --git a/examples/hybrid_pipeline.ipynb b/examples/hybrid_pipeline.ipynb
diff --git a/examples/native_nodes.ipynb b/examples/native_nodes.ipynb
diff --git a/examples/node_transformations.ipynb b/examples/node_transformations.ipynb
diff --git a/examples/prev_next_augmentation.ipynb b/examples/prev_next_augmentation.ipynb
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -47,6 +47,7 @@ llama-index-llms-huggingface-api = { version = "^0.2.0", optional = true }
 llama-index-vector-stores-milvus = { version = "^0.2.1", optional = true }
 llama-index-postprocessor-flag-embedding-reranker  = {version = "^0.2.0", optional = true }
 flagembedding = { version = "^1.2.10", optional = true }
+peft = { version = "^0.12.0", optional = true }  # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
 jsonpath-ng = { version = "^1.6.1", optional = true }
 
 ##############
@@ -69,6 +70,7 @@ examples = [
     "llama-index-vector-stores-milvus",
     "llama-index-postprocessor-flag-embedding-reranker",
     "flagembedding",
+    "peft",  # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
     "jsonpath-ng",
 ]
 

diff --git a/quackling/core/chunkers/__init__.py b/quackling/core/chunkers/__init__.py
@@ -2,3 +2,5 @@
 # Copyright IBM Corp. 2024 - 2024
 # SPDX-License-Identifier: MIT
 #
+
+from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker  # noqa
diff --git a/quackling/llama_index/node_parsers/__init__.py b/quackling/llama_index/node_parsers/__init__.py
@@ -2,3 +2,7 @@
 # Copyright IBM Corp. 2024 - 2024
 # SPDX-License-Identifier: MIT
 #
+
+from quackling.llama_index.node_parsers.hier_node_parser import (  # noqa
+    HierarchicalJSONNodeParser,
+)
diff --git a/quackling/llama_index/node_parsers/hier_node_parser.py b/quackling/llama_index/node_parsers/hier_node_parser.py
@@ -19,11 +19,15 @@
 )
 from llama_index.core.utils import get_tqdm_iterable
 from pydantic import Field
+from typing_extensions import deprecated
 
-from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
+from quackling.core.chunkers import HierarchicalChunker
 from quackling.llama_index.node_parsers.base import NodeMetadata
 
 
+@deprecated(
+    "Use `quackling.llama_index.node_parsers.HierarchicalJSONNodeParser` instead."
+)
 class HierarchicalNodeParser(NodeParser):
 
     # override default to False to avoid inheriting source doc's metadata
@@ -79,3 +83,7 @@ def _parse_nodes(
                 ).model_dump()
                 all_nodes.append(node)
         return all_nodes
+
+
+class HierarchicalJSONNodeParser(HierarchicalNodeParser):  # new name for the deprecated parent class; adds no behavior of its own
+    pass  # NOTE(review): subclassing a `@deprecated` class may itself emit the deprecation warning at import — confirm