From 68639a674d2db886c206062c00b434c1a192a8f6 Mon Sep 17 00:00:00 2001
From: Jerry Liu
Date: Mon, 19 Feb 2024 09:13:25 -0800
Subject: [PATCH] add subdoc summary pack (#10934)

---
 .../llama-index-packs-subdoc-summary/BUILD    |   5 +
 .../llama-index-packs-subdoc-summary/Makefile |  17 +
 .../README.md                                 |  53 +++
 .../examples/subdoc-summary.ipynb             | 357 ++++++++++++++++++
 .../llama_index/packs/subdoc_summary/BUILD    |   1 +
 .../packs/subdoc_summary/__init__.py          |   4 +
 .../llama_index/packs/subdoc_summary/base.py  |  93 +++++
 .../pyproject.toml                            |  54 +++
 .../tests/__init__.py                         |   0
 9 files changed, 584 insertions(+)
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/BUILD
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/Makefile
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/README.md
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/examples/subdoc-summary.ipynb
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/BUILD
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/__init__.py
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/pyproject.toml
 create mode 100644 llama-index-packs/llama-index-packs-subdoc-summary/tests/__init__.py

diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/BUILD b/llama-index-packs/llama-index-packs-subdoc-summary/BUILD
new file mode 100644
index 0000000000000..0b67818b466ca
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-subdoc-summary/BUILD
@@ -0,0 +1,5 @@
+python_sources()
+
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/Makefile b/llama-index-packs/llama-index-packs-subdoc-summary/Makefile
new file mode 100644
index 0000000000000..b9eab05aa3706
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-subdoc-summary/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/README.md b/llama-index-packs/llama-index-packs-subdoc-summary/README.md
new file mode 100644
index 0000000000000..5a721d5c85c26
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-subdoc-summary/README.md
@@ -0,0 +1,53 @@
+# LlamaIndex Packs Integration: Subdoc-Summary
+
+This LlamaPack provides an advanced technique for injecting each chunk with "sub-document" metadata. This context augmentation technique is helpful for both retrieving relevant context and for synthesizing correct answers.
+
+It is a step beyond simply adding a summary of the document as the metadata to each chunk. Within a long document, there can be multiple distinct themes, and we want each chunk to be grounded in global but relevant context.
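+
+Concretely, every child chunk ends up carrying a `context_summary` metadata field holding a short summary of the larger parent chunk it came from. Here is a hypothetical sketch of what one indexed child chunk's metadata might look like (illustrative values, not real output; only the `context_summary` key is added by this pack):
+
+```python
+# Illustrative only: metadata attached to one child chunk after ingestion.
+child_metadata = {
+    "file_name": "llama2.pdf",  # standard file metadata from the loader
+    "page_label": "1",
+    # Injected by this pack: a 1-2 sentence summary of the parent chunk.
+    "context_summary": "Llama 2 is a collection of pretrained and fine-tuned LLMs ...",
+}
+print(child_metadata["context_summary"])
+```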
+
+This technique was inspired by our "Practical Tips and Tricks" video: https://www.youtube.com/watch?v=ZP1F9z-S7T0.
+
+## Installation
+
+```bash
+pip install llama-index llama-index-packs-subdoc-summary
+```
+
+## CLI Usage
+
+You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` Python package:
+
+```bash
+llamaindex-cli download-llamapack SubDocSummaryPack --download-dir ./subdoc_summary_pack
+```
+
+You can then inspect the files at `./subdoc_summary_pack` and use them as a template for your own project.
+
+## Code Usage
+
+You can download the pack to the `./subdoc_summary_pack` directory:
+
+```python
+from llama_index.core.llama_pack import download_llama_pack
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.openai import OpenAIEmbedding
+
+# download and install dependencies
+SubDocSummaryPack = download_llama_pack(
+    "SubDocSummaryPack", "./subdoc_summary_pack"
+)
+
+# You can use any llama-hub loader to get documents!
+subdoc_summary_pack = SubDocSummaryPack(
+    documents,
+    parent_chunk_size=8192,  # default
+    child_chunk_size=512,  # default
+    llm=OpenAI(model="gpt-3.5-turbo"),
+    embed_model=OpenAIEmbedding(),
+)
+```
+
+Initializing the pack will split documents into parent chunks and child chunks. It will inject parent chunk summaries into child chunks, and index the child chunks.
+
+Running the pack will run the query engine over the vectorized child chunks.
+
+```python
+response = subdoc_summary_pack.run("<query>", similarity_top_k=2)
+```
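+
+If you want lower-level access instead of the bundled query engine, the pack also exposes its internals. A minimal sketch, assuming the module names defined in this pack's `base.py` (`vector_index`, `vector_retriever`, `vector_query_engine`):
+
+```python
+# Retrieve child chunks directly, without LLM answer synthesis.
+modules = subdoc_summary_pack.get_modules()
+retriever = modules["vector_retriever"]
+
+nodes = retriever.retrieve("How was Llama2 pretrained?")
+for node in nodes:
+    # Each retrieved child chunk carries the injected parent-chunk summary.
+    print(node.score, node.metadata["context_summary"])
+```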
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/examples/subdoc-summary.ipynb b/llama-index-packs/llama-index-packs-subdoc-summary/examples/subdoc-summary.ipynb
new file mode 100644
index 0000000000000..b07ac5b76980c
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-subdoc-summary/examples/subdoc-summary.ipynb
@@ -0,0 +1,357 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8dd0acdb-5aec-4129-8772-81f56d6b25cf",
+   "metadata": {},
+   "source": [
+    "# Sub-Document Summary Metadata Pack\n",
+    "\n",
+    "This LlamaPack provides an advanced technique for injecting each chunk with \"sub-document\" metadata. This context augmentation technique is helpful for both retrieving relevant context and for synthesizing correct answers.\n",
+    "\n",
+    "It is a step beyond simply adding a summary of the document as the metadata to each chunk. Within a long document, there can be multiple distinct themes, and we want each chunk to be grounded in global but relevant context."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66818da6-a3fb-4537-b30a-922a8a0ef99e",
+   "metadata": {},
+   "source": [
+    "## Setup Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "317a3207-1211-4a6a-bd7d-3ab14f399951",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "811.82s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n",
+      "817.00s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+      "100 13.0M  100 13.0M    0     0  27.7M      0 --:--:-- --:--:-- --:--:-- 28.0M\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir -p 'data/'\n",
+    "!curl 'https://arxiv.org/pdf/2307.09288.pdf' -o 'data/llama2.pdf'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf6ab9c0-c993-4ab2-8343-b294676d7550",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SimpleDirectoryReader\n",
+    "\n",
+    "documents = SimpleDirectoryReader(\"data\").load_data()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "98bfbe4b-539c-469c-82e6-1f823f28d5f4",
+   "metadata": {},
+   "source": [
+    "## Run the Sub-Document Summary Metadata Pack"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af4b815e-f5ce-406b-9dcb-5a23fc9f96db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install llama-index-packs-subdoc-summary llama-index-llms-openai llama-index-embeddings-openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d619362b-ae45-4e47-b400-1c2ce7262496",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.packs.subdoc_summary import SubDocSummaryPack\n",
+    "from llama_index.llms.openai import OpenAI\n",
+    "from llama_index.embeddings.openai import OpenAIEmbedding\n",
+    "\n",
+    "subdoc_summary_pack = SubDocSummaryPack(\n",
+    "    documents,\n",
+    "    parent_chunk_size=8192,  # default\n",
+    "    child_chunk_size=512,  # default\n",
+    "    llm=OpenAI(model=\"gpt-3.5-turbo\"),\n",
+    "    embed_model=OpenAIEmbedding(),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb11a60d-d356-40c5-84c1-4135382bfbfd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "Llama 2 was pretrained using an optimized auto-regressive transformer with robust data cleaning, updated data mixes, training on 40% more total tokens, doubling the context length, and using grouped-query attention to improve inference scalability for larger models."
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/markdown": [
+       "**Node ID:** 172a1344-d48d-443b-8383-677037570c06<br>
**Similarity:** 0.8720929924174893<br>
**Text:** page_label: 1\n", + "file_name: llama2.pdf\n", + "file_path: data/llama2.pdf\n", + "file_type: application/pdf\n", + "file_size: 13661300\n", + "creation_date: 2024-02-17\n", + "last_modified_date: 2024-02-17\n", + "last_accessed_date: 2024-02-17\n", + "context_summary: Llama 2 is a collection of pretrained and fine-tuned large language models optimized for dialogue use cases, ranging from 7 billion to 70 billion parameters. The models, known as Llama 2-Chat, have shown superior performance compared to open-source chat models on various benchmarks and are considered as potential alternatives to closed-source models.\n", + "\n", + "Llama 2 : Open Foundation and Fine-Tuned Chat Models\n", + "Hugo Touvron∗Louis Martin†Kevin Stone†\n", + "Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\n", + "Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\n", + "Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\n", + "Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\n", + "Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev\n", + "Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich\n", + "Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra\n", + "Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi\n", + "Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang\n", + "Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang\n", + "Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic\n", + "Sergey Edunov Thomas Scialom∗\n", + "GenAI, Meta\n", + "Abstract\n", + "In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned\n", + "large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.\n", + "Our fine-tuned LLMs, called Llama 2-Chat , are optimized for dialogue use cases. Our\n", + "models outperform open-source chat models on most benchmarks we tested, and based on\n", + "ourhumanevaluationsforhelpfulnessandsafety,maybeasuitablesubstituteforclosed-\n", + "source models. We provide a detailed description of our approach to fine-tuning and safety\n", + "improvements of Llama 2-Chat in order to enable the community to build on our work and\n", + "contribute to the responsible development of LLMs.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** dbbde2a7-d51c-4245-959d-ba97ba414b55
**Similarity:** 0.8700958215249326<br>
**Text:** page_label: 5\n", + "file_name: llama2.pdf\n", + "file_path: data/llama2.pdf\n", + "file_type: application/pdf\n", + "file_size: 13661300\n", + "creation_date: 2024-02-17\n", + "last_modified_date: 2024-02-17\n", + "last_accessed_date: 2024-02-17\n", + "context_summary: Llama 2-Chat is developed through pretraining, supervised fine-tuning, and reinforcement learning with human feedback methodologies, focusing on refining the model iteratively. The training process involves using an optimized auto-regressive transformer, robust data cleaning, updated data mixes, and specific architectural enhancements like increased context length and grouped-query attention.\n", + "\n", + "Figure4: Trainingof Llama 2-Chat : Thisprocessbeginswiththe pretraining ofLlama 2 usingpublicly\n", + "availableonlinesources. Followingthis,wecreateaninitialversionof Llama 2-Chat throughtheapplication\n", + "ofsupervised fine-tuning . Subsequently, the model is iteratively refined using Reinforcement Learning\n", + "with Human Feedback (RLHF) methodologies, specifically through rejection sampling and Proximal Policy\n", + "Optimization(PPO).ThroughouttheRLHFstage,theaccumulationof iterativerewardmodelingdata in\n", + "parallel with model enhancements is crucial to ensure the reward models remain within distribution.\n", + "2 Pretraining\n", + "Tocreatethenewfamilyof Llama 2models,webeganwiththepretrainingapproachdescribedinTouvronetal.\n", + "(2023), using an optimized auto-regressive transformer, but made several changes to improve performance.\n", + "Specifically,weperformedmorerobustdatacleaning,updatedourdatamixes,trainedon40%moretotal\n", + "tokens,doubledthecontextlength,andusedgrouped-queryattention(GQA)toimproveinferencescalability\n", + "for our larger models. Table 1 compares the attributes of the new Llama 2 models with the Llama 1 models.\n", + "2.1 Pretraining Data\n", + "Our training corpus includes a new mix of data from publicly available sources, which does not include data\n", + "fromMeta’sproductsorservices. Wemadeanefforttoremovedatafromcertainsitesknowntocontaina\n", + "highvolumeofpersonalinformationaboutprivateindividuals. Wetrainedon2trilliontokensofdataasthis\n", + "providesagoodperformance–costtrade-off,up-samplingthemostfactualsourcesinanefforttoincrease\n", + "knowledge and dampen hallucinations.\n", + "Weperformedavarietyofpretrainingdatainvestigationssothatuserscanbetterunderstandthepotential\n", + "capabilities and limitations of our models; results can be found in Section 4.1.\n", + "2.2 Training Details\n", + "We adopt most of the pretraining setting and model architecture from Llama 1 .
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import Markdown, display\n", + "from llama_index.core.response.notebook_utils import display_source_node\n", + "\n", + "response = subdoc_summary_pack.run(\"How was Llama2 pretrained?\")\n", + "display(Markdown(str(response)))\n", + "for n in response.source_nodes:\n", + " display_source_node(n, source_length=10000, metadata_mode=\"all\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1181af9d-680f-4ba3-89e2-f88b12a89cc7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "The latest ChatGPT model, equipped with Ghost Attention (GAtt), demonstrates strong multi-turn memory ability by consistently referring to defined attributes for up to 20 turns in a conversation. This integration of GAtt in the ChatGPT model allows for efficient long context attention beyond 2048 tokens, showcasing potential for robust performance in handling extended contexts." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 005a3c23-8d97-4e5d-957e-98ad2dfb93ad
**Similarity:** 0.7923889627946064<br>
**Text:** page_label: 54\n", + "file_name: llama2.pdf\n", + "file_path: data/llama2.pdf\n", + "file_type: application/pdf\n", + "file_size: 13661300\n", + "creation_date: 2024-02-17\n", + "last_modified_date: 2024-02-17\n", + "last_accessed_date: 2024-02-17\n", + "context_summary: Llama 2-Chat with GAtt consistently refers to defined attributes for up to 20 turns, showcasing strong multi-turn memory ability. The integration of GAtt in Llama 2-Chat enables efficient long context attention beyond 2048 tokens, indicating potential for robust performance in handling extended contexts.\n", + "\n", + "Dialogue Turn Baseline + GAtt\n", + "2 100% 100%\n", + "4 10% 100%\n", + "6 0% 100%\n", + "20 0% 100%\n", + "Table30: GAttresults. Llama 2-Chat withGAttisabletorefertoattributes100%ofthetime,forupto20\n", + "turns from our human evaluation. We limited the evaluated attributes to public figures and hobbies.\n", + "Theattentionnowspansbeyond20turns. Wetestedthemodelabilitytorememberthesystemarguments\n", + "troughahumanevaluation. Thearguments(e.g. hobbies,persona)aredefinedduringthefirstmessage,and\n", + "then from turn 2 to 20. We explicitly asked the model to refer to them (e.g. “What is your favorite hobby?”,\n", + "“Whatisyourname?”),tomeasurethemulti-turnmemoryabilityof Llama 2-Chat . Wereporttheresults\n", + "inTable30. EquippedwithGAtt, Llama 2-Chat maintains100%accuracy,alwaysreferringtothedefined\n", + "attribute,andso,upto20turns(wedidnotextendthehumanevaluationmore,andalltheexampleshad\n", + "lessthan4048tokensintotalovertheturns). Asacomparison, Llama 2-Chat withoutGAttcannotanymore\n", + "refer to the attributes after only few turns: from 100% at turn t+1, to 10% at turn t+3 and then 0%.\n", + "GAttZero-shotGeneralisation. Wetriedatinferencetimetosetconstrainnotpresentinthetrainingof\n", + "GAtt. For instance, “answer in one sentence only”, for which the model remained consistent, as illustrated in\n", + "Figure 28.\n", + "We applied first GAtt to Llama 1 , which was pretrained with a context length of 2048 tokens and then\n", + "fine-tuned with 4096 max length. We tested if GAtt works beyond 2048 tokens, and the model arguably\n", + "managed to understand attributes beyond this window. This promising result indicates that GAtt could be\n", + "adapted as an efficient technique for long context attention.\n", + "A.3.6 How Far Can Model-Based Evaluation Go?
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 0b1719e9-d7fa-42af-890b-5eeb946857c5
**Similarity:** 0.7837282816384877<br>
**Text:** page_label: 16\n", + "file_name: llama2.pdf\n", + "file_path: data/llama2.pdf\n", + "file_type: application/pdf\n", + "file_size: 13661300\n", + "creation_date: 2024-02-17\n", + "last_modified_date: 2024-02-17\n", + "last_accessed_date: 2024-02-17\n", + "context_summary: The text discusses the challenges faced in maintaining multi-turn consistency in dialogue systems and introduces a method called Ghost Attention (GAtt) to address these issues. GAtt involves incorporating instructions throughout a conversation to ensure dialogue control over multiple turns.\n", + "\n", + "Figure 9: Issues with multi-turn memory (left)can be improved with GAtt (right).\n", + "We train for between 200and400iterations for all our models, and use evaluations on held-out prompts for\n", + "earlystopping. EachiterationofPPOonthe70Bmodeltakesonaverage ≈330seconds. Totrainquicklywith\n", + "large batch sizes, we use FSDP (Zhao et al., 2023). This was effective when using O(1) forward or backward\n", + "passes,butcausedalargeslowdown( ≈20×)duringgeneration,evenwhenusingalargebatchsizeandKV\n", + "cache. We were able to mitigate this by consolidating the model weights to each node once before generation\n", + "and then freeing the memory after generation, resuming the rest of the training loop.\n", + "3.3 System Message for Multi-Turn Consistency\n", + "In a dialogue setup, some instructions should apply for all the conversation turns, e.g., to respond succinctly,\n", + "or to“act as”some public figure. When we provided such instructions to Llama 2-Chat , the subsequent\n", + "response should always respect the constraint. However, our initial RLHF models tended to forget the initial\n", + "instruction after a few turns of dialogue, as illustrated in Figure 9 (left).\n", + "To address these limitations, we propose Ghost Attention (GAtt), a very simple method inspired by Context\n", + "Distillation (Bai et al., 2022b) that hacks the fine-tuning data to help the attention focus in a multi-stage\n", + "process. GAtt enables dialogue control over multiple turns, as illustrated in Figure 9 (right).\n", + "GAttMethod. Assumewe haveaccess toa multi-turndialoguedataset betweentwo persons(e.g., auser\n", + "and an assistant), with a list of messages [u1, a1, . . . , u n, an], where unandancorrespond to the user and\n", + "assistant messages for turn n, respectively. Then, we define an instruction, inst, that should be respected\n", + "throughout the dialogue. For example, instcould be “act as.” We can then synthetically concatenate this\n", + "instruction to all the user messages of the conversation.\n", + "Next, we can sample from this synthetic data using the latest RLHF model.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import Markdown, display\n", + "\n", + "response = subdoc_summary_pack.run(\n", + " \"What is the functionality of latest ChatGPT memory.\"\n", + ")\n", + "display(Markdown(str(response)))\n", + "\n", + "for n in response.source_nodes:\n", + " display_source_node(n, source_length=10000, metadata_mode=\"all\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_index_v3", + "language": "python", + "name": "llama_index_v3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/BUILD b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/__init__.py b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/__init__.py new file mode 100644 index 0000000000000..d1e5d547adc6b --- /dev/null +++ b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/__init__.py @@ -0,0 +1,4 @@ +from llama_index.packs.subdoc_summary.base import SubDocSummaryPack + + +__all__ = ["SubDocSummaryPack"] diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py new file mode 100644 index 0000000000000..db2df79cadd73 --- /dev/null +++ b/llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py @@ -0,0 +1,93 @@ +"""Subdoc Summary.""" + +from typing import Any, Dict, List, Optional, List + +from llama_index.core.llama_pack import BaseLlamaPack +from llama_index.core.schema import Document +from llama_index.core.text_splitter import SentenceSplitter +from llama_index.core.utils import print_text +from llama_index.core import SummaryIndex, VectorStoreIndex +from llama_index.core.embeddings import BaseEmbedding +from llama_index.core.llms import LLM + + +DEFAULT_SUMMARY_PROMPT_STR = """\ +Please give a concise summary of the context in 1-2 sentences. 
+""" + + +class SubDocSummaryPack(BaseLlamaPack): + """Pack for injecting sub-doc metadata into each chunk.""" + + def __init__( + self, + documents: List[Document], + parent_chunk_size: int = 8192, + parent_chunk_overlap: int = 512, + child_chunk_size: int = 512, + child_chunk_overlap: int = 32, + summary_prompt_str: str = DEFAULT_SUMMARY_PROMPT_STR, + verbose: bool = False, + embed_model: Optional[BaseEmbedding] = None, + llm: Optional[LLM] = None, + ) -> None: + """Init params.""" + self.parent_chunk_size = parent_chunk_size + self.child_chunk_size = child_chunk_size + + self.parent_splitter = SentenceSplitter( + chunk_size=parent_chunk_size, chunk_overlap=parent_chunk_overlap + ) + self.child_splitter = SentenceSplitter( + chunk_size=child_chunk_size, chunk_overlap=child_chunk_overlap + ) + + self.summary_prompt_str = summary_prompt_str + self.embed_model = embed_model + self.llm = llm + + parent_nodes = self.parent_splitter.get_nodes_from_documents(documents) + all_child_nodes = [] + # For each parent node, extract the child nodes and print the text + for idx, parent_node in enumerate(parent_nodes): + if verbose: + print_text( + f"> Processing parent chunk {idx + 1} of {len(parent_nodes)}\n", + color="blue", + ) + # get summary + summary_index = SummaryIndex([parent_node]) + summary_query_engine = summary_index.as_query_engine( + response_mode="tree_summarize" + ) + parent_summary = summary_query_engine.query(DEFAULT_SUMMARY_PROMPT_STR) + if verbose: + print_text(f"Extracted summary: {parent_summary}\n", color="pink") + + # attach summary to all child nodes + child_nodes = self.child_splitter.get_nodes_from_documents([parent_node]) + for child_node in child_nodes: + child_node.metadata["context_summary"] = str(parent_summary) + + all_child_nodes.extend(child_nodes) + + # build vector index for child nodes + self.vector_index = VectorStoreIndex( + all_child_nodes, embed_model=self.embed_model + ) + self.vector_retriever = self.vector_index.as_retriever() + self.vector_query_engine = self.vector_index.as_query_engine(llm=llm) + + self.verbose = verbose + + def get_modules(self) -> Dict[str, Any]: + """Get modules.""" + return { + "vector_index": self.vector_index, + "vector_retriever": self.vector_retriever, + "vector_query_engine": self.vector_query_engine, + } + + def run(self, *args: Any, **kwargs: Any) -> Any: + """Run the pipeline.""" + return self.vector_query_engine.query(*args, **kwargs) diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/pyproject.toml b/llama-index-packs/llama-index-packs-subdoc-summary/pyproject.toml new file mode 100644 index 0000000000000..6b7c25b8691f9 --- /dev/null +++ b/llama-index-packs/llama-index-packs-subdoc-summary/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +classes = ["SubDocSummaryPack"] +contains_example = false +import_path = "llama_index.packs.subdoc_summary" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index packs subdoc-summary 
+license = "MIT"
+name = "llama-index-packs-subdoc-summary"
+packages = [{include = "llama_index/"}]
+readme = "README.md"
+version = "0.1.0"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<3.12"
+llama-index-core = "^0.10.0"
+
+[tool.poetry.group.dev.dependencies]
+black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
+codespell = {extras = ["toml"], version = ">=v2.2.6"}
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "7.2.1"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"  # TODO: unpin when mypy>0.991
+types-setuptools = "67.1.0.0"
diff --git a/llama-index-packs/llama-index-packs-subdoc-summary/tests/__init__.py b/llama-index-packs/llama-index-packs-subdoc-summary/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d