Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
refactor: refactor LlamaIndex extensions
Browse files (browse the repository at this point in the history)
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Sep 4, 2024
1 parent 6acb898 commit 412c1be
Show file tree
Hide file tree
Showing 18 changed files with 853 additions and 658 deletions.
35 changes: 19 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,25 +59,26 @@ import os
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser
from quackling.llama_index.readers.docling_reader import DoclingReader
from quackling.llama_index.node_parsers import HierarchicalJSONNodeParser
from quackling.llama_index.readers import DoclingPDFReader

DOCS = ["https://arxiv.org/pdf/2311.18481"]
QUERY = "What is DocQA?"
DOCS = ["https://arxiv.org/pdf/2206.01062"]
QUESTION = "How many pages were human annotated?"
EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
LLM = HuggingFaceInferenceAPI(
token=os.getenv("HF_TOKEN"),
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
)

index = VectorStoreIndex.from_documents(
documents=DoclingReader(parse_type=DoclingReader.ParseType.JSON).load_data(DOCS),
documents=DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON).load_data(DOCS),
embed_model=EMBED_MODEL,
transformations=[HierarchicalNodeParser()],
transformations=[HierarchicalJSONNodeParser()],
)
query_engine = index.as_query_engine(llm=LLM)
response = query_engine.query(QUERY)
# > DocQA is a question-answering conversational assistant [...]
result = query_engine.query(QUESTION)
print(result.response)
# > 80K pages were human annotated
```

### Chunking
Expand All @@ -88,7 +89,7 @@ to Docling document's nodes:

```python
from docling.document_converter import DocumentConverter
from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
from quackling.core.chunkers import HierarchicalChunker

doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2408.09869").output
chunks = list(HierarchicalChunker().chunk(doc))
Expand Down Expand Up @@ -120,13 +121,15 @@ Please read [Contributing to Quackling](./CONTRIBUTING.md) for details.
If you use Quackling in your projects, please consider citing the following:

```bib
@software{Docling,
author = {Deep Search Team},
month = {7},
title = {{Docling}},
url = {https://github.com/DS4SD/docling},
version = {main},
year = {2024}
@techreport{Docling,
author = "Deep Search Team",
month = 8,
title = "Docling Technical Report",
url = "https://arxiv.org/abs/2408.09869",
eprint = "2408.09869",
doi = "10.48550/arXiv.2408.09869",
version = "1.0.0",
year = 2024
}
```

Expand Down
102 changes: 60 additions & 42 deletions examples/basic_pipeline.ipynb

Large diffs are not rendered by default.

214 changes: 89 additions & 125 deletions examples/hybrid_pipeline.ipynb

Large diffs are not rendered by default.

183 changes: 81 additions & 102 deletions examples/native_nodes.ipynb

Large diffs are not rendered by default.

237 changes: 112 additions & 125 deletions examples/node_transformations.ipynb

Large diffs are not rendered by default.

487 changes: 336 additions & 151 deletions examples/prev_next_augmentation.ipynb

Large diffs are not rendered by default.

142 changes: 83 additions & 59 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ llama-index-llms-huggingface-api = { version = "^0.2.0", optional = true }
llama-index-vector-stores-milvus = { version = "^0.2.1", optional = true }
llama-index-postprocessor-flag-embedding-reranker = {version = "^0.2.0", optional = true }
flagembedding = { version = "^1.2.10", optional = true }
peft = { version = "^0.12.0", optional = true } # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
jsonpath-ng = { version = "^1.6.1", optional = true }

##############
Expand All @@ -69,6 +70,7 @@ examples = [
"llama-index-vector-stores-milvus",
"llama-index-postprocessor-flag-embedding-reranker",
"flagembedding",
"peft", # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
"jsonpath-ng",
]

Expand Down
2 changes: 2 additions & 0 deletions quackling/core/chunkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker # noqa
4 changes: 4 additions & 0 deletions quackling/llama_index/node_parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.node_parsers.hier_node_parser import ( # noqa
HierarchicalJSONNodeParser,
)
10 changes: 9 additions & 1 deletion quackling/llama_index/node_parsers/hier_node_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,15 @@
)
from llama_index.core.utils import get_tqdm_iterable
from pydantic import Field
from typing_extensions import deprecated

from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
from quackling.core.chunkers import HierarchicalChunker
from quackling.llama_index.node_parsers.base import NodeMetadata


@deprecated(
"Use `quackling.llama_index.node_parsers.HierarchicalJSONNodeParser` instead."
)
class HierarchicalNodeParser(NodeParser):

# override default to False to avoid inheriting source doc's metadata
Expand Down Expand Up @@ -79,3 +83,7 @@ def _parse_nodes(
).model_dump()
all_nodes.append(node)
return all_nodes


class HierarchicalJSONNodeParser(HierarchicalNodeParser):
    """Subclass of ``HierarchicalNodeParser`` published under the new name.

    Declares no attributes or methods of its own; everything is inherited
    unchanged from ``HierarchicalNodeParser``, whose deprecation message
    directs users to this class.
    """

    # NOTE(review): the parent is decorated with `typing_extensions.deprecated`.
    # Depending on the installed typing_extensions version, merely subclassing
    # a deprecated class may emit a DeprecationWarning at import time --
    # TODO confirm, and consider inverting the hierarchy so that the
    # deprecated name becomes the (empty) subclass instead.
Loading

0 comments on commit 412c1be

Please sign in to comment.