add subdoc summary pack (#10934)

run-llama · Feb 19, 2024 · 68639a6 · 68639a6
1 parent a607f97
commit 68639a6
Show file tree

Hide file tree

Showing 9 changed files with 584 additions and 0 deletions.
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/BUILD b/llama-index-packs/llama-index-packs-subdoc-summary/BUILD
@@ -0,0 +1,5 @@
+python_sources()
+
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/Makefile b/llama-index-packs/llama-index-packs-subdoc-summary/Makefile
@@ -0,0 +1,17 @@
+# Root of the enclosing git checkout; can be overridden from the environment.
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+# Every target below is a command name, not a file to build. Declaring them
+# phony keeps them runnable even if a file or directory with the same name
+# (e.g. `test`) appears next to this Makefile.
+.PHONY: help format lint test watch-docs
+
+# `help` lists every target that carries a trailing `## description` comment.
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/README.md b/llama-index-packs/llama-index-packs-subdoc-summary/README.md
@@ -0,0 +1,53 @@
+# LlamaIndex Packs Integration: Subdoc-Summary
+
+This LlamaPack provides an advanced technique for injecting each chunk with "sub-document" metadata. This context augmentation technique is helpful for both retrieving relevant context and for synthesizing correct answers.
+
+It is a step beyond simply adding a summary of the document as the metadata to each chunk. Within a long document, there can be multiple distinct themes, and we want each chunk to be grounded in global but relevant context.
+
+This technique was inspired by our "Practical Tips and Tricks" video: https://www.youtube.com/watch?v=ZP1F9z-S7T0.
+
+## Installation
+
+```bash
+pip install llama-index llama-index-packs-subdoc-summary
+```
+
+## CLI Usage
+
+You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:
+
+```bash
+llamaindex-cli download-llamapack SubDocSummaryPack --download-dir ./subdoc_summary_pack
+```
+
+You can then inspect the files at `./subdoc_summary_pack` and use them as a template for your own project.
+
+## Code Usage
+
+You can download the pack to the `./subdoc_summary_pack` directory:
+
+```python
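+from llama_index.core.llama_pack import download_llama_pack
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.llms.openai import OpenAI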
+
+# download and install dependencies
+SubDocSummaryPack = download_llama_pack(
+    "SubDocSummaryPack", "./subdoc_summary_pack"
+)
+
+# You can use any llama-hub loader to get documents!
+subdoc_summary_pack = SubDocSummaryPack(
+    documents,
+    parent_chunk_size=8192,  # default
+    child_chunk_size=512,  # default
+    llm=OpenAI(model="gpt-3.5-turbo"),
+    embed_model=OpenAIEmbedding(),
+)
+```
+
+Initializing the pack will split documents into parent chunks and child chunks. It will inject parent chunk summaries into child chunks, and index the child chunks.
+
+Running the pack will run the query engine over the vectorized child chunks.
+
+```python
+response = subdoc_summary_pack.run("<query>", similarity_top_k=2)
+```
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/examples/subdoc-summary.ipynb b/llama-index-packs/llama-index-packs-subdoc-summary/examples/subdoc-summary.ipynb
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/BUILD b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/...index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/__init__.py b/...index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/__init__.py
@@ -0,0 +1,4 @@
+from llama_index.packs.subdoc_summary.base import SubDocSummaryPack
+
+
+__all__ = ["SubDocSummaryPack"]
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py
@@ -0,0 +1,93 @@
+"""Subdoc Summary."""
+
+from typing import Any, Dict, List, Optional, List
+
+from llama_index.core.llama_pack import BaseLlamaPack
+from llama_index.core.schema import Document
+from llama_index.core.text_splitter import SentenceSplitter
+from llama_index.core.utils import print_text
+from llama_index.core import SummaryIndex, VectorStoreIndex
+from llama_index.core.embeddings import BaseEmbedding
+from llama_index.core.llms import LLM
+
+
+# Default prompt used to ask for a summary of each parent chunk.
+# The backslash right after the opening quotes suppresses the leading newline,
+# so the value is a single line of text followed by one trailing newline.
+DEFAULT_SUMMARY_PROMPT_STR = """\
+Please give a concise summary of the context in 1-2 sentences.
+"""
+
+
+class SubDocSummaryPack(BaseLlamaPack):
+    """Pack for injecting sub-doc metadata into each chunk."""
+
+    def __init__(
+        self,
+        documents: List[Document],
+        parent_chunk_size: int = 8192,
+        parent_chunk_overlap: int = 512,
+        child_chunk_size: int = 512,
+        child_chunk_overlap: int = 32,
+        summary_prompt_str: str = DEFAULT_SUMMARY_PROMPT_STR,
+        verbose: bool = False,
+        embed_model: Optional[BaseEmbedding] = None,
+        llm: Optional[LLM] = None,
+    ) -> None:
+        """Init params."""
+        self.parent_chunk_size = parent_chunk_size
+        self.child_chunk_size = child_chunk_size
+
+        self.parent_splitter = SentenceSplitter(
+            chunk_size=parent_chunk_size, chunk_overlap=parent_chunk_overlap
+        )
+        self.child_splitter = SentenceSplitter(
+            chunk_size=child_chunk_size, chunk_overlap=child_chunk_overlap
+        )
+
+        self.summary_prompt_str = summary_prompt_str
+        self.embed_model = embed_model
+        self.llm = llm
+
+        parent_nodes = self.parent_splitter.get_nodes_from_documents(documents)
+        all_child_nodes = []
+        # For each parent node, extract the child nodes and print the text
+        for idx, parent_node in enumerate(parent_nodes):
+            if verbose:
+                print_text(
+                    f"> Processing parent chunk {idx + 1} of {len(parent_nodes)}\n",
+                    color="blue",
+                )
+            # get summary
+            summary_index = SummaryIndex([parent_node])
+            summary_query_engine = summary_index.as_query_engine(
+                response_mode="tree_summarize"
+            )
+            parent_summary = summary_query_engine.query(DEFAULT_SUMMARY_PROMPT_STR)
+            if verbose:
+                print_text(f"Extracted summary: {parent_summary}\n", color="pink")
+
+            # attach summary to all child nodes
+            child_nodes = self.child_splitter.get_nodes_from_documents([parent_node])
+            for child_node in child_nodes:
+                child_node.metadata["context_summary"] = str(parent_summary)
+
+            all_child_nodes.extend(child_nodes)
+
+        # build vector index for child nodes
+        self.vector_index = VectorStoreIndex(
+            all_child_nodes, embed_model=self.embed_model
+        )
+        self.vector_retriever = self.vector_index.as_retriever()
+        self.vector_query_engine = self.vector_index.as_query_engine(llm=llm)
+
+        self.verbose = verbose
+
+    def get_modules(self) -> Dict[str, Any]:
+        """Get modules."""
+        return {
+            "vector_index": self.vector_index,
+            "vector_retriever": self.vector_retriever,
+            "vector_query_engine": self.vector_query_engine,
+        }
+
+    def run(self, *args: Any, **kwargs: Any) -> Any:
+        """Run the pipeline."""
+        return self.vector_query_engine.query(*args, **kwargs)
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/pyproject.toml b/llama-index-packs/llama-index-packs-subdoc-summary/pyproject.toml
@@ -0,0 +1,54 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+# Feel free to un-skip examples, and experimental, you will just need to
+# work through many typos (--write-changes and --interactive will help)
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.llamahub]
+classes = ["SubDocSummaryPack"]
+contains_example = false
+import_path = "llama_index.packs.subdoc_summary"
+
+[tool.mypy]
+disallow_untyped_defs = true
+# Remove venv skip when integrated with pre-commit
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = "3.8"
+
+[tool.poetry]
+authors = ["Your Name <you@example.com>"]
+description = "llama-index packs subdoc-summary implementation"
+license = "MIT"
+name = "llama-index-packs-subdoc-summary"
+packages = [{include = "llama_index/"}]
+readme = "README.md"
+version = "0.1.0"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<3.12"
+llama-index-core = "^0.10.0"
+
+[tool.poetry.group.dev.dependencies]
+black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
+codespell = {extras = ["toml"], version = ">=v2.2.6"}
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "7.2.1"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"  # TODO: unpin when mypy>0.991
+types-setuptools = "67.1.0.0"
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/tests/__init__.py b/llama-index-packs/llama-index-packs-subdoc-summary/tests/__init__.py