Showing 9 changed files with 584 additions and 0 deletions.
python_sources()

poetry_requirements(
    name="poetry",
)
17 changes: 17 additions & 0 deletions
llama-index-packs/llama-index-packs-subdoc-summary/Makefile
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)

help:	## Show all Makefile targets.
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format:	## Run code autoformatters (black).
	pre-commit install
	git ls-files | xargs pre-commit run black --files

lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test:	## Run tests via pytest.
	pytest tests

watch-docs:	## Build and watch documentation.
	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
53 changes: 53 additions & 0 deletions
llama-index-packs/llama-index-packs-subdoc-summary/README.md
# LlamaIndex Packs Integration: Subdoc-Summary

This LlamaPack provides an advanced technique for injecting each chunk with "sub-document" metadata. This context augmentation technique is helpful both for retrieving relevant context and for synthesizing correct answers.

It is a step beyond simply adding a summary of the document as the metadata to each chunk. Within a long document, there can be multiple distinct themes, and we want each chunk to be grounded in global but relevant context.

This technique was inspired by our "Practical Tips and Tricks" video: https://www.youtube.com/watch?v=ZP1F9z-S7T0.

## Installation

```bash
pip install llama-index llama-index-packs-subdoc-summary
```

## CLI Usage

You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` Python package:

```bash
llamaindex-cli download-llamapack SubDocSummaryPack --download-dir ./subdoc_summary_pack
```

You can then inspect the files at `./subdoc_summary_pack` and use them as a template for your own project.
## Code Usage

You can download the pack to the `./subdoc_summary_pack` directory:

```python
from llama_index.core.llama_pack import download_llama_pack

# download and install dependencies
SubDocSummaryPack = download_llama_pack(
    "SubDocSummaryPack", "./subdoc_summary_pack"
)

# The OpenAI integrations ship as separate packages
# (llama-index-llms-openai, llama-index-embeddings-openai);
# both come bundled with `llama-index`.
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# You can use any llama-hub loader to get documents!
subdoc_summary_pack = SubDocSummaryPack(
    documents,
    parent_chunk_size=8192,  # default
    child_chunk_size=512,  # default
    llm=OpenAI(model="gpt-3.5-turbo"),
    embed_model=OpenAIEmbedding(),
)
```

Initializing the pack will split documents into parent chunks and child chunks, inject each parent chunk's summary into its child chunks as metadata, and index the child chunks. A quick way to inspect the injected metadata is sketched below.
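
This is a minimal sketch (assuming the `subdoc_summary_pack` constructed above) that uses the pack's exposed `vector_retriever` module to verify the injection:

```python
# Sketch: retrieve a few child chunks and print the parent-chunk summary
# that was injected into each one as `context_summary` metadata.
nodes = subdoc_summary_pack.vector_retriever.retrieve("<query>")
for node_with_score in nodes:
    print(node_with_score.node.metadata["context_summary"])
```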

Running the pack will run the query engine over the vectorized child chunks.

```python
response = subdoc_summary_pack.run("<query>", similarity_top_k=2)
```
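
To see which chunks grounded the answer, the returned `response` object exposes the retrieved child nodes via `source_nodes` (standard llama-index response behavior; `response` as in the snippet above):

```python
# Print the synthesized answer, then a preview of each child chunk
# that was retrieved to produce it.
print(str(response))
for node_with_score in response.source_nodes:
    print(node_with_score.node.get_content()[:200])
```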
357 changes: 357 additions & 0 deletions
llama-index-packs/llama-index-packs-subdoc-summary/examples/subdoc-summary.ipynb
Large diffs are not rendered by default.
1 change: 1 addition & 0 deletions
llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/BUILD
python_sources()
4 changes: 4 additions & 0 deletions
...index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/__init__.py
@@ -0,0 +1,4 @@ | ||
from llama_index.packs.subdoc_summary.base import SubDocSummaryPack | ||
|
||
|
||
__all__ = ["SubDocSummaryPack"] |
93 changes: 93 additions & 0 deletions
llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py
"""Subdoc Summary."""

from typing import Any, Dict, List, Optional

from llama_index.core.llama_pack import BaseLlamaPack
from llama_index.core.schema import Document
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.core.utils import print_text
from llama_index.core import SummaryIndex, VectorStoreIndex
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.llms import LLM


DEFAULT_SUMMARY_PROMPT_STR = """\
Please give a concise summary of the context in 1-2 sentences.
"""


class SubDocSummaryPack(BaseLlamaPack):
    """Pack for injecting sub-doc metadata into each chunk."""

    def __init__(
        self,
        documents: List[Document],
        parent_chunk_size: int = 8192,
        parent_chunk_overlap: int = 512,
        child_chunk_size: int = 512,
        child_chunk_overlap: int = 32,
        summary_prompt_str: str = DEFAULT_SUMMARY_PROMPT_STR,
        verbose: bool = False,
        embed_model: Optional[BaseEmbedding] = None,
        llm: Optional[LLM] = None,
    ) -> None:
        """Init params."""
        self.parent_chunk_size = parent_chunk_size
        self.child_chunk_size = child_chunk_size

        self.parent_splitter = SentenceSplitter(
            chunk_size=parent_chunk_size, chunk_overlap=parent_chunk_overlap
        )
        self.child_splitter = SentenceSplitter(
            chunk_size=child_chunk_size, chunk_overlap=child_chunk_overlap
        )

        self.summary_prompt_str = summary_prompt_str
        self.embed_model = embed_model
        self.llm = llm

        parent_nodes = self.parent_splitter.get_nodes_from_documents(documents)
        all_child_nodes = []
        # For each parent chunk: summarize it, then attach the summary to its child chunks
        for idx, parent_node in enumerate(parent_nodes):
            if verbose:
                print_text(
                    f"> Processing parent chunk {idx + 1} of {len(parent_nodes)}\n",
                    color="blue",
                )
            # Summarize the parent chunk via a tree-summarize query
            summary_index = SummaryIndex([parent_node])
            summary_query_engine = summary_index.as_query_engine(
                response_mode="tree_summarize", llm=llm
            )
            parent_summary = summary_query_engine.query(self.summary_prompt_str)
            if verbose:
                print_text(f"Extracted summary: {parent_summary}\n", color="pink")

            # Attach the summary to all child nodes as metadata
            child_nodes = self.child_splitter.get_nodes_from_documents([parent_node])
            for child_node in child_nodes:
                child_node.metadata["context_summary"] = str(parent_summary)

            all_child_nodes.extend(child_nodes)

        # Build a vector index over the summary-augmented child nodes
        self.vector_index = VectorStoreIndex(
            all_child_nodes, embed_model=self.embed_model
        )
        self.vector_retriever = self.vector_index.as_retriever()
        self.vector_query_engine = self.vector_index.as_query_engine(llm=llm)

        self.verbose = verbose

    def get_modules(self) -> Dict[str, Any]:
        """Get modules."""
        return {
            "vector_index": self.vector_index,
            "vector_retriever": self.vector_retriever,
            "vector_query_engine": self.vector_query_engine,
        }

    def run(self, *args: Any, **kwargs: Any) -> Any:
        """Run the pipeline."""
        return self.vector_query_engine.query(*args, **kwargs)
54 changes: 54 additions & 0 deletions
llama-index-packs/llama-index-packs-subdoc-summary/pyproject.toml
@@ -0,0 +1,54 @@ | ||
[build-system] | ||
build-backend = "poetry.core.masonry.api" | ||
requires = ["poetry-core"] | ||
|
||
[tool.codespell] | ||
check-filenames = true | ||
check-hidden = true | ||
# Feel free to un-skip examples, and experimental, you will just need to | ||
# work through many typos (--write-changes and --interactive will help) | ||
skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" | ||
|
||
[tool.llamahub] | ||
classes = ["SubDocSummaryPack"] | ||
contains_example = false | ||
import_path = "llama_index.packs.subdoc_summary" | ||
|
||
[tool.mypy] | ||
disallow_untyped_defs = true | ||
# Remove venv skip when integrated with pre-commit | ||
exclude = ["_static", "build", "examples", "notebooks", "venv"] | ||
ignore_missing_imports = true | ||
python_version = "3.8" | ||
|
||
[tool.poetry] | ||
authors = ["Your Name <[email protected]>"] | ||
description = "llama-index packs subdoc-summary implementation" | ||
license = "MIT" | ||
name = "llama-index-packs-subdoc-summary" | ||
packages = [{include = "llama_index/"}] | ||
readme = "README.md" | ||
version = "0.1.0" | ||
|
||
[tool.poetry.dependencies] | ||
python = ">=3.8.1,<3.12" | ||
llama-index-core = "^0.10.0" | ||
|
||
[tool.poetry.group.dev.dependencies] | ||
black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} | ||
codespell = {extras = ["toml"], version = ">=v2.2.6"} | ||
ipython = "8.10.0" | ||
jupyter = "^1.0.0" | ||
mypy = "0.991" | ||
pre-commit = "3.2.0" | ||
pylint = "2.15.10" | ||
pytest = "7.2.1" | ||
pytest-mock = "3.11.1" | ||
ruff = "0.0.292" | ||
tree-sitter-languages = "^1.8.0" | ||
types-Deprecated = ">=0.1.0" | ||
types-PyYAML = "^6.0.12.12" | ||
types-protobuf = "^4.24.0.4" | ||
types-redis = "4.5.5.0" | ||
types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 | ||
types-setuptools = "67.1.0.0" |
Empty file.