diff --git a/.github/workflows/publish_sub_package.yml b/.github/workflows/publish_sub_package.yml index 33aa2b6febb1a..731560e9475b2 100644 --- a/.github/workflows/publish_sub_package.yml +++ b/.github/workflows/publish_sub_package.yml @@ -4,6 +4,7 @@ on: push: branches: - main + env: POETRY_VERSION: "1.6.1" PYTHON_VERSION: "3.10" diff --git a/llama-index-packs/llama-index-packs-raptor/.gitignore b/llama-index-packs/llama-index-packs-raptor/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-packs/llama-index-packs-raptor/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-packs/llama-index-packs-raptor/BUILD b/llama-index-packs/llama-index-packs-raptor/BUILD new file mode 100644 index 0000000000000..09bd1b6726c8f --- /dev/null +++ b/llama-index-packs/llama-index-packs-raptor/BUILD @@ -0,0 +1,4 @@ +poetry_requirements( + name="poetry", + module_mapping={"umap-learn": ["umap"], "scikit-learn": ["sklearn"]} +) diff --git a/llama-index-packs/llama-index-packs-raptor/Makefile b/llama-index-packs/llama-index-packs-raptor/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-packs/llama-index-packs-raptor/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. 
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format: ## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy.
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test: ## Run tests via pytest.
+	pytest tests
+
+watch-docs: ## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-packs/llama-index-packs-raptor/README.md b/llama-index-packs/llama-index-packs-raptor/README.md
new file mode 100644
index 0000000000000..4b9b07fb5a549
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-raptor/README.md
@@ -0,0 +1,78 @@
+# Raptor Retriever LlamaPack
+
+This LlamaPack provides an implementation of RAPTOR (Recursive Abstractive Processing for Tree-Organized Retrieval) for llama-index.
+
+RAPTOR works by recursively clustering and summarizing clusters in layers for retrieval.
+
+There are two retrieval modes:
+
+- tree_traversal -- traverses the tree of clusters, performing top-k retrieval at each level of the tree.
+- collapsed -- treats the entire tree as a single pool of nodes and performs a simple top-k retrieval.
+
+See [the paper](https://arxiv.org/abs/2401.18059) for full algorithm details.
+
+## CLI Usage
+
+You can download llamapacks directly using `llamaindex-cli`, which comes installed with the `llama-index` Python package:
+
+```bash
+llamaindex-cli download-llamapack RaptorPack --download-dir ./raptor_pack
+```
+
+You can then inspect/modify the files at `./raptor_pack` and use them as a template for your own project.
+
+## Code Usage
+
+Alternatively, you can install the package:
+
+`pip install llama-index-packs-raptor`
+
+Then, you can import and initialize the pack! This will perform clustering and summarization over your data.
+
+```python
+from llama_index.packs.raptor import RaptorPack
+
+pack = RaptorPack(documents, llm=llm, embed_model=embed_model)
+```
+
+The `run()` function is a light wrapper around `retriever.retrieve()`.
+
+```python
+nodes = pack.run(
+    "query",
+    mode="collapsed",  # or tree_traversal
+)
+```
+
+You can also use modules individually.
+
+```python
+# get the retriever
+retriever = pack.retriever
+```
+
+## Persistence
+
+The `RaptorPack` comes with the `RaptorRetriever`, which offers ways of saving and reloading!
+
+If you are using a remote vector database, just pass it in:
+
+```python
+# Pack usage
+pack = RaptorPack(..., vector_store=vector_store)
+
+# RaptorRetriever usage
+retriever = RaptorRetriever(..., vector_store=vector_store)
+```
+
+Then, to re-connect, just pass in the vector store again along with an empty list of documents:
+
+```python
+# Pack usage
+pack = RaptorPack([], ..., vector_store=vector_store)
+
+# RaptorRetriever usage
+retriever = RaptorRetriever([], ..., vector_store=vector_store)
+```
+
+Check out the [notebook here](https://github.com/run-llama/llama_index/blob/main/llama-index-packs/llama-index-packs-raptor/examples/raptor.ipynb) for complete details!
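+
+## Query Engine
+
+The retriever also plugs into a standard query engine, as shown in the example notebook. A minimal sketch -- `llm` here is assumed to be the same LLM you passed to the pack above:
+
+```python
+from llama_index.core.query_engine import RetrieverQueryEngine
+
+query_engine = RetrieverQueryEngine.from_args(pack.retriever, llm=llm)
+
+response = query_engine.query("What baselines was RAPTOR compared against?")
+print(str(response))
+```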
diff --git a/llama-index-packs/llama-index-packs-raptor/examples/raptor.ipynb b/llama-index-packs/llama-index-packs-raptor/examples/raptor.ipynb
new file mode 100644
index 0000000000000..1f550dc52375e
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-raptor/examples/raptor.ipynb
@@ -0,0 +1,379 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval\n",
+    "\n",
+    "This notebook shows how to use an implementation of RAPTOR with llama-index, leveraging the RAPTOR llama-pack.\n",
+    "\n",
+    "RAPTOR works by recursively clustering and summarizing clusters in layers for retrieval.\n",
+    "\n",
+    "There are two retrieval modes:\n",
+    "- tree_traversal -- traverses the tree of clusters, performing top-k retrieval at each level of the tree.\n",
+    "- collapsed -- treats the entire tree as a single pool of nodes and performs a simple top-k retrieval.\n",
+    "\n",
+    "See [the paper](https://arxiv.org/abs/2401.18059) for full algorithm details."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install llama-index llama-index-packs-raptor llama-index-vector-stores-chroma"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.packs.raptor import RaptorPack\n",
+    "\n",
+    "# optionally download the pack to inspect/modify it yourself!\n",
+    "# from llama_index.core.llama_pack import download_llama_pack\n",
+    "# RaptorPack = download_llama_pack(\"RaptorPack\", \"./raptor_pack\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.\n",
+      "ERROR: could not open HSTS store at '/home/loganm/.wget-hsts'. HSTS will be disabled.\n",
+      "--2024-02-29 22:16:11--  https://arxiv.org/pdf/2401.18059.pdf\n",
+      "Resolving arxiv.org (arxiv.org)... 151.101.3.42, 151.101.195.42, 151.101.131.42, ...\n",
+      "Connecting to arxiv.org (arxiv.org)|151.101.3.42|:443... connected.\n",
+      "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 2547113 (2.4M) [application/pdf]\n", + "Saving to: ‘./raptor_paper.pdf’\n", + "\n", + "./raptor_paper.pdf 100%[===================>] 2.43M 12.5MB/s in 0.2s \n", + "\n", + "2024-02-29 22:16:12 (12.5 MB/s) - ‘./raptor_paper.pdf’ saved [2547113/2547113]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://arxiv.org/pdf/2401.18059.pdf -O ./raptor_paper.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Constructing the Clusters/Hierarchy Tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import SimpleDirectoryReader\n", + "\n", + "documents = SimpleDirectoryReader(input_files=[\"./raptor_paper.pdf\"]).load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating embeddings for level 0.\n", + "Performing clustering for level 0.\n", + "Generating summaries for level 0 with 10 clusters.\n", + "Level 0 created summaries/clusters: 10\n", + "Generating embeddings for level 1.\n", + "Performing clustering for level 1.\n", + "Generating summaries for level 1 with 1 clusters.\n", + "Level 1 created summaries/clusters: 1\n", + "Generating embeddings for level 2.\n", + "Performing clustering for level 2.\n", + "Generating summaries for level 2 with 1 clusters.\n", + "Level 2 created summaries/clusters: 1\n" + ] + } + ], + "source": [ + "from llama_index.core.node_parser import SentenceSplitter\n", + "from llama_index.llms.openai import OpenAI\n", + "from llama_index.embeddings.openai import OpenAIEmbedding\n", + "from llama_index.vector_stores.chroma import ChromaVectorStore\n", + "import chromadb\n", + "\n", + "client = chromadb.PersistentClient(path=\"./raptor_paper_db\")\n", + "collection = client.get_or_create_collection(\"raptor\")\n", + "\n", + "vector_store = ChromaVectorStore(chroma_collection=collection)\n", + "\n", + "raptor_pack = RaptorPack(\n", + " documents,\n", + " embed_model=OpenAIEmbedding(\n", + " model=\"text-embedding-3-small\"\n", + " ), # used for embedding clusters\n", + " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1), # used for generating summaries\n", + " vector_store=vector_store, # used for storage\n", + " similarity_top_k=2, # top k for each layer, or overall top-k for collapsed\n", + " mode=\"collapsed\", # sets default mode\n", + " transformations=[\n", + " SentenceSplitter(chunk_size=400, chunk_overlap=50)\n", + " ], # transformations applied for ingestion\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n", + "Specifically, RAPTOR’s F-1 scores are at least 1.8% points higher than DPR and at least 5.3% points\n", + "higher than BM25.\n", + "Retriever GPT-3 F-1 Match GPT-4 F-1 Match UnifiedQA F-1 Match\n", + "Title + Abstract 25.2 22.2 17.5\n", + "BM25 46.6 50.2 26.4\n", + "DPR 51.3 53.0 32.1\n", + "RAPTOR 53.1 55.7 36.6\n", + "Table 4: 
Comparison of accuracies on the QuAL-\n", + "ITY dev dataset for two different language mod-\n", + "els (GPT-3, UnifiedQA 3B) using various retrieval\n", + "methods. RAPTOR outperforms the baselines of\n", + "BM25 and DPR by at least 2.0% in accuracy.\n", + "Model GPT-3 Acc. UnifiedQA Acc.\n", + "BM25 57.3 49.9\n", + "DPR 60.4 53.9\n", + "RAPTOR 62.4 56.6\n", + "Table 5: Results on F-1 Match scores of various\n", + "models on the QASPER dataset.\n", + "Model F-1 Match\n", + "LongT5 XL (Guo et al., 2022) 53.1\n", + "CoLT5 XL (Ainslie et al., 2023) 53.9\n", + "RAPTOR + GPT-4 55.7Comparison to State-of-the-art Systems\n", + "Building upon our controlled comparisons,\n", + "we examine RAPTOR’s performance relative\n", + "to other state-of-the-art models.\n" + ] + } + ], + "source": [ + "nodes = raptor_pack.run(\"What baselines is raptor compared against?\", mode=\"collapsed\")\n", + "print(len(nodes))\n", + "print(nodes[0].text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Retrieved parent IDs from level 2: ['cc3b3f41-f4ca-4020-b11f-be7e0ce04c4f']\n", + "Retrieved 1 from parents at level 2.\n", + "Retrieved parent IDs from level 1: ['a4ca9426-a312-4a01-813a-c9b02aefc7e8']\n", + "Retrieved 2 from parents at level 1.\n", + "Retrieved parent IDs from level 0: ['63126782-2778-449f-99c0-1e8fd90caa36', 'd8f68d31-d878-41f1-aeb6-a7dde8ed5143']\n", + "Retrieved 4 from parents at level 0.\n", + "4\n", + "Specifically, RAPTOR’s F-1 scores are at least 1.8% points higher than DPR and at least 5.3% points\n", + "higher than BM25.\n", + "Retriever GPT-3 F-1 Match GPT-4 F-1 Match UnifiedQA F-1 Match\n", + "Title + Abstract 25.2 22.2 17.5\n", + "BM25 46.6 50.2 26.4\n", + "DPR 51.3 53.0 32.1\n", + "RAPTOR 53.1 55.7 36.6\n", + "Table 4: Comparison of accuracies on the QuAL-\n", + "ITY dev dataset for two different language mod-\n", + "els (GPT-3, UnifiedQA 3B) using various retrieval\n", + "methods. RAPTOR outperforms the baselines of\n", + "BM25 and DPR by at least 2.0% in accuracy.\n", + "Model GPT-3 Acc. UnifiedQA Acc.\n", + "BM25 57.3 49.9\n", + "DPR 60.4 53.9\n", + "RAPTOR 62.4 56.6\n", + "Table 5: Results on F-1 Match scores of various\n", + "models on the QASPER dataset.\n", + "Model F-1 Match\n", + "LongT5 XL (Guo et al., 2022) 53.1\n", + "CoLT5 XL (Ainslie et al., 2023) 53.9\n", + "RAPTOR + GPT-4 55.7Comparison to State-of-the-art Systems\n", + "Building upon our controlled comparisons,\n", + "we examine RAPTOR’s performance relative\n", + "to other state-of-the-art models.\n" + ] + } + ], + "source": [ + "nodes = raptor_pack.run(\n", + " \"What baselines is raptor compared against?\", mode=\"tree_traversal\"\n", + ")\n", + "print(len(nodes))\n", + "print(nodes[0].text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading\n", + "\n", + "Since we saved to a vector store, we can also use it again! 
(For local vector stores, there is a `persist` and `from_persist_dir` method on the retriever)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.packs.raptor import RaptorRetriever\n", + "\n", + "retriever = RaptorRetriever(\n", + " [],\n", + " embed_model=OpenAIEmbedding(\n", + " model=\"text-embedding-3-small\"\n", + " ), # used for embedding clusters\n", + " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1), # used for generating summaries\n", + " vector_store=vector_store, # used for storage\n", + " similarity_top_k=2, # top k for each layer, or overall top-k for collapsed\n", + " mode=\"tree_traversal\", # sets default mode\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if using a default vector store\n", + "# retriever.persist(\"./persist\")\n", + "# retriever = RaptorRetriever.from_persist_dir(\"./persist\", ...)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query Engine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core.query_engine import RetrieverQueryEngine\n", + "\n", + "query_engine = RetrieverQueryEngine.from_args(\n", + " raptor_pack.retriever, llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = query_engine.query(\"What baselines was RAPTOR compared against?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BM25 and DPR\n" + ] + } + ], + "source": [ + "print(str(response))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama-index-4aB9_5sa-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/data_level0.bin b/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/data_level0.bin new file mode 100644 index 0000000000000..ea3192e8ec511 Binary files /dev/null and b/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/data_level0.bin differ diff --git a/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/header.bin b/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/header.bin new file mode 100644 index 0000000000000..3e0932a7d0033 Binary files /dev/null and b/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/header.bin differ diff --git a/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/length.bin b/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/length.bin new file mode 100644 index 0000000000000..45bfe72ed91d6 Binary files /dev/null and 
b/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/length.bin differ
diff --git a/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/link_lists.bin b/llama-index-packs/llama-index-packs-raptor/examples/raptor/81db1dbe-a06d-43a6-ba07-875398bc33a7/link_lists.bin
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llama-index-packs/llama-index-packs-raptor/examples/raptor/chroma.sqlite3 b/llama-index-packs/llama-index-packs-raptor/examples/raptor/chroma.sqlite3
new file mode 100644
index 0000000000000..ca77f198baaba
Binary files /dev/null and b/llama-index-packs/llama-index-packs-raptor/examples/raptor/chroma.sqlite3 differ
diff --git a/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/BUILD b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/__init__.py b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/__init__.py
new file mode 100644
index 0000000000000..7bc9f986b42d1
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/__init__.py
@@ -0,0 +1,4 @@
+from llama_index.packs.raptor.base import RaptorPack, RaptorRetriever
+
+
+__all__ = ["RaptorPack", "RaptorRetriever"]
diff --git a/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/base.py b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/base.py
new file mode 100644
index 0000000000000..89a539ee07aa8
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/base.py
@@ -0,0 +1,366 @@
+from typing import Any, Dict, List, Optional
+
+import asyncio
+from enum import Enum
+
+from llama_index.core import (
+    StorageContext,
+    VectorStoreIndex,
+    get_response_synthesizer,
+    load_index_from_storage,
+)
+from llama_index.core.base.base_retriever import BaseRetriever, QueryType
+from llama_index.core.bridge.pydantic import BaseModel, Field
+from llama_index.core.embeddings import BaseEmbedding
+from llama_index.core.ingestion import run_transformations
+from llama_index.core.llama_pack.base import BaseLlamaPack
+from llama_index.core.llms.llm import LLM
+from llama_index.core.response_synthesizers import BaseSynthesizer
+from llama_index.core.schema import (
+    BaseNode,
+    NodeWithScore,
+    QueryBundle,
+    TextNode,
+    TransformComponent,
+)
+from llama_index.core.vector_stores.types import (
+    MetadataFilter,
+    MetadataFilters,
+    VectorStore,
+)
+from llama_index.packs.raptor.clustering import get_clusters
+
+
+DEFAULT_SUMMARY_PROMPT = (
+    "Summarize the provided text, including as many key details as needed."
+)
+
+
+class QueryModes(str, Enum):
+    """Query modes."""
+
+    tree_traversal = "tree_traversal"
+    collapsed = "collapsed"
+
+
+class SummaryModule(BaseModel):
+    response_synthesizer: BaseSynthesizer = Field(
+        description="Response synthesizer used to summarize each cluster."
+    )
+    summary_prompt: str = Field(
+        default=DEFAULT_SUMMARY_PROMPT,
+        description="Summary prompt.",
+    )
+    num_workers: int = Field(
+        default=4, description="Number of workers to generate summaries."
+    )
+    show_progress: bool = Field(default=True, description="Show progress.")
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(
+        self, llm: Optional[LLM] = None, summary_prompt: str = DEFAULT_SUMMARY_PROMPT
+    ) -> None:
+        response_synthesizer = get_response_synthesizer(
+            response_mode="tree_summarize", use_async=True, llm=llm
+        )
+        super().__init__(
+            response_synthesizer=response_synthesizer, summary_prompt=summary_prompt
+        )
+
+    async def generate_summaries(
+        self, documents_per_cluster: List[List[BaseNode]]
+    ) -> List[str]:
+        """Generate summaries of documents per cluster.
+
+        Args:
+            documents_per_cluster (List[List[BaseNode]]): List of documents per cluster
+
+        Returns:
+            List[str]: List of summaries, one per cluster
+        """
+        jobs = []
+        for documents in documents_per_cluster:
+            with_scores = [NodeWithScore(node=doc, score=1.0) for doc in documents]
+            jobs.append(
+                self.response_synthesizer.asynthesize(self.summary_prompt, with_scores)
+            )
+
+        # run the jobs concurrently, limiting in-flight jobs to num_workers
+        semaphore = asyncio.Semaphore(self.num_workers)
+
+        async def run_job(job: Any) -> Any:
+            async with semaphore:
+                return await job
+
+        responses = await asyncio.gather(*[run_job(job) for job in jobs])
+
+        return [str(response) for response in responses]
+
+
+class RaptorRetriever(BaseRetriever):
+    """Raptor indexing retriever."""
+
+    def __init__(
+        self,
+        documents: List[BaseNode],
+        tree_depth: int = 3,
+        similarity_top_k: int = 2,
+        llm: Optional[LLM] = None,
+        embed_model: Optional[BaseEmbedding] = None,
+        vector_store: Optional[VectorStore] = None,
+        transformations: Optional[List[TransformComponent]] = None,
+        summary_module: Optional[SummaryModule] = None,
+        existing_index: Optional[VectorStoreIndex] = None,
+        mode: QueryModes = "collapsed",
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        super().__init__(
+            **kwargs,
+        )
+
+        self.mode = mode
+        self.summary_module = summary_module or SummaryModule(llm=llm)
+        self.index = existing_index or VectorStoreIndex(
+            nodes=[],
+            storage_context=StorageContext.from_defaults(vector_store=vector_store),
+            embed_model=embed_model,
+            transformations=transformations,
+        )
+        self.tree_depth = tree_depth
+        self.similarity_top_k = similarity_top_k
+
+        if len(documents) > 0:
+            asyncio.run(self.insert(documents))
+
+    def _get_embeddings_per_level(self, level: int = 0) -> List[BaseNode]:
+        """Retrieve the nodes at a given level in the abstraction tree.
+
+        Args:
+            level (int, optional): Target level. Defaults to 0, which stands for leaf nodes.
+
+        Returns:
+            List[BaseNode]: List of nodes stored at the given level
+        """
+        filters = MetadataFilters(filters=[MetadataFilter(key="level", value=level)])
+
+        # kind of janky, but should work with any vector index
+        source_nodes = self.index.as_retriever(
+            similarity_top_k=10000, filters=filters
+        ).retrieve("retrieve")
+
+        return [x.node for x in source_nodes]
+
+    async def insert(self, documents: List[BaseNode]) -> None:
+        """Given a set of documents, insert higher levels of abstraction
+        into the index for later retrieval.
+
+        Args:
+            documents (List[BaseNode]): List of Documents
+        """
+        embed_model = self.index._embed_model
+        transformations = self.index._transformations
+
+        cur_nodes = run_transformations(documents, transformations, in_place=False)
+        for level in range(self.tree_depth):
+            # get the embeddings for the current documents
+            if self._verbose:
+                print(f"Generating embeddings for level {level}.")
+
+            embeddings = await embed_model.aget_text_embedding_batch(
+                [node.get_content(metadata_mode="embed") for node in cur_nodes]
+            )
+            assert len(embeddings) == len(cur_nodes)
+            id_to_embedding = {
+                node.id_: embedding for node, embedding in zip(cur_nodes, embeddings)
+            }
+
+            if self._verbose:
+                print(f"Performing clustering for level {level}.")
+
+            # cluster the documents
+            nodes_per_cluster = get_clusters(cur_nodes, id_to_embedding)
+
+            if self._verbose:
+                print(
+                    f"Generating summaries for level {level} with {len(nodes_per_cluster)} clusters."
+                )
+            summaries_per_cluster = await self.summary_module.generate_summaries(
+                nodes_per_cluster
+            )
+
+            if self._verbose:
+                print(
+                    f"Level {level} created summaries/clusters: {len(nodes_per_cluster)}"
+                )
+
+            # replace the current nodes with their summaries
+            new_nodes = [
+                TextNode(
+                    text=summary,
+                    metadata={"level": level},
+                    excluded_embed_metadata_keys=["level"],
+                    excluded_llm_metadata_keys=["level"],
+                )
+                for summary in summaries_per_cluster
+            ]
+
+            # insert the nodes with their embeddings and parent_id
+            nodes_with_embeddings = []
+            for cluster, summary_doc in zip(nodes_per_cluster, new_nodes):
+                for node in cluster:
+                    node.metadata["parent_id"] = summary_doc.id_
+                    node.excluded_embed_metadata_keys.append("parent_id")
+                    node.excluded_llm_metadata_keys.append("parent_id")
+                    node.embedding = id_to_embedding[node.id_]
+                    nodes_with_embeddings.append(node)
+
+            self.index.insert_nodes(nodes_with_embeddings)
+
+            # set the current nodes to the new nodes
+            cur_nodes = new_nodes
+
+        # insert the final top-level summaries as well
+        self.index.insert_nodes(cur_nodes)
+
+    async def collapsed_retrieval(self, query_str: str) -> List[NodeWithScore]:
+        """Query the index as a collapsed tree -- i.e. a single pool of nodes."""
+        return await self.index.as_retriever(
+            similarity_top_k=self.similarity_top_k
+        ).aretrieve(query_str)
+
+    async def tree_traversal_retrieval(self, query_str: str) -> List[NodeWithScore]:
+        """Query the index as a tree, traversing the tree from the top down."""
+        # get top k nodes for each level, starting with the top
+        parent_ids = None
+        nodes = []
+        level = self.tree_depth - 1
+        while level >= 0:
+            # retrieve nodes at the current level
+            if parent_ids is None:
+                nodes = await self.index.as_retriever(
+                    similarity_top_k=self.similarity_top_k,
+                    filters=MetadataFilters(
+                        filters=[MetadataFilter(key="level", value=level)]
+                    ),
+                ).aretrieve(query_str)
+
+                parent_ids = [node.id_ for node in nodes]
+                if self._verbose:
+                    print(f"Retrieved parent IDs from level {level}: {parent_ids!s}")
+            # retrieve nodes that are children of the nodes at the previous level
+            elif parent_ids is not None and len(parent_ids) > 0:
+                nested_nodes = await asyncio.gather(
+                    *[
+                        self.index.as_retriever(
+                            similarity_top_k=self.similarity_top_k,
+                            filters=MetadataFilters(
+                                filters=[MetadataFilter(key="parent_id", value=id_)]
+                            ),
+                        ).aretrieve(query_str)
+                        for id_ in parent_ids
+                    ]
+                )
+
+                nodes = [node for nested in nested_nodes for node in nested]
+
+                if self._verbose:
+                    print(f"Retrieved {len(nodes)} from parents at level {level}.")
+
+                level -= 1
+                parent_ids = None
+
+        return nodes
+
+    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
+        """Retrieve nodes given query and mode."""
+        # not used directly -- `retrieve()` below handles both modes; this stub
+        # only satisfies the abstract interface of BaseRetriever
+        return []
+
+    def retrieve(
+        self, query_str_or_bundle: QueryType, mode: Optional[QueryModes] = None
+    ) -> List[NodeWithScore]:
+        """Retrieve nodes given query and mode."""
+        if isinstance(query_str_or_bundle, QueryBundle):
+            query_str = query_str_or_bundle.query_str
+        else:
+            query_str = query_str_or_bundle
+
+        return asyncio.run(self.aretrieve(query_str, mode or self.mode))
+
+    async def aretrieve(
+        self, query_str_or_bundle: QueryType, mode: Optional[QueryModes] = None
+    ) -> List[NodeWithScore]:
+        """Retrieve nodes given query and mode."""
+        if isinstance(query_str_or_bundle, QueryBundle):
+            query_str = query_str_or_bundle.query_str
+        else:
+            query_str = query_str_or_bundle
+
+        mode = mode or self.mode
+        if mode == "tree_traversal":
+            return await self.tree_traversal_retrieval(query_str)
+        elif mode == "collapsed":
+            return await self.collapsed_retrieval(query_str)
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+
+    def persist(self, persist_dir: str) -> None:
+        self.index.storage_context.persist(persist_dir=persist_dir)
+
+    @classmethod
+    def from_persist_dir(
+        cls,
+        persist_dir: str,
+        embed_model: Optional[BaseEmbedding] = None,
+        **kwargs: Any,
+    ) -> "RaptorRetriever":
+        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
+        return cls(
+            [],
+            existing_index=load_index_from_storage(
+                storage_context, embed_model=embed_model
+            ),
+            **kwargs,
+        )
+
+
+class RaptorPack(BaseLlamaPack):
+    """Raptor pack."""
+
+    def __init__(
+        self,
+        documents: List[BaseNode],
+        llm: Optional[LLM] = None,
+        embed_model: Optional[BaseEmbedding] = None,
+        vector_store: Optional[VectorStore] = None,
+        similarity_top_k: int = 2,
+        mode: QueryModes = "collapsed",
+        verbose: bool = True,
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        self.retriever = RaptorRetriever(
+            documents,
+            embed_model=embed_model,
+            llm=llm,
+            similarity_top_k=similarity_top_k,
+            vector_store=vector_store,
+            mode=mode,
+            verbose=verbose,
+            **kwargs,
+        )
+
+ def get_modules(self) -> Dict[str, Any]: + """Get modules.""" + return { + "retriever": self.retriever, + } + + def run( + self, + query: str, + mode: Optional[QueryModes] = None, + ) -> Any: + """Run the pipeline.""" + return self.retriever.retrieve(query, mode=mode) diff --git a/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/clustering.py b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/clustering.py new file mode 100644 index 0000000000000..3969446f53ac0 --- /dev/null +++ b/llama-index-packs/llama-index-packs-raptor/llama_index/packs/raptor/clustering.py @@ -0,0 +1,169 @@ +""" +Minorly tweaked from https://github.com/parthsarthi03/raptor/blob/master/raptor/cluster_tree_builder.py. + +Full credits to the original authors! +""" + +import numpy as np +import random +import tiktoken +import umap +from sklearn.mixture import GaussianMixture +from typing import Dict, List, Optional + +from llama_index.core.schema import BaseNode + + +# Set a random seed for reproducibility +RANDOM_SEED = 224 +random.seed(RANDOM_SEED) + + +def global_cluster_embeddings( + embeddings: np.ndarray, + dim: int, + n_neighbors: Optional[int] = None, + metric: str = "cosine", +) -> np.ndarray: + if n_neighbors is None: + n_neighbors = int((len(embeddings) - 1) ** 0.5) + return umap.UMAP( + n_neighbors=n_neighbors, n_components=dim, metric=metric + ).fit_transform(embeddings) + + +def local_cluster_embeddings( + embeddings: np.ndarray, dim: int, num_neighbors: int = 10, metric: str = "cosine" +) -> np.ndarray: + return umap.UMAP( + n_neighbors=num_neighbors, n_components=dim, metric=metric + ).fit_transform(embeddings) + + +def get_optimal_clusters( + embeddings: np.ndarray, max_clusters: int = 50, random_state: int = RANDOM_SEED +) -> int: + max_clusters = min(max_clusters, len(embeddings)) + n_clusters = np.arange(1, max_clusters) + bics = [] + for n in n_clusters: + gm = GaussianMixture(n_components=n, random_state=random_state) + gm.fit(embeddings) + bics.append(gm.bic(embeddings)) + return n_clusters[np.argmin(bics)] + + +def GMM_cluster(embeddings: np.ndarray, threshold: float, random_state: int = 0): + n_clusters = get_optimal_clusters(embeddings) + gm = GaussianMixture(n_components=n_clusters, random_state=random_state) + gm.fit(embeddings) + probs = gm.predict_proba(embeddings) + labels = [np.where(prob > threshold)[0] for prob in probs] + return labels, n_clusters + + +def perform_clustering( + embeddings: np.ndarray, + dim: int, + threshold: float, +) -> List[np.ndarray]: + # If the number of embeddings is less than or equal to the dimension, return a list of zeros + # This means all nodes are in the same cluster. + # Otherwise, we will get an error when trying to cluster. 
+    if len(embeddings) <= dim + 1:
+        return [np.array([0]) for _ in range(len(embeddings))]
+
+    reduced_embeddings_global = global_cluster_embeddings(embeddings, dim)
+    global_clusters, n_global_clusters = GMM_cluster(
+        reduced_embeddings_global, threshold
+    )
+
+    all_local_clusters = [np.array([]) for _ in range(len(embeddings))]
+    total_clusters = 0
+
+    for i in range(n_global_clusters):
+        global_cluster_embeddings_ = embeddings[
+            np.array([i in gc for gc in global_clusters])
+        ]
+
+        if len(global_cluster_embeddings_) == 0:
+            continue
+        if len(global_cluster_embeddings_) <= dim + 1:
+            local_clusters = [np.array([0]) for _ in global_cluster_embeddings_]
+            n_local_clusters = 1
+        else:
+            reduced_embeddings_local = local_cluster_embeddings(
+                global_cluster_embeddings_, dim
+            )
+            local_clusters, n_local_clusters = GMM_cluster(
+                reduced_embeddings_local, threshold
+            )
+
+        for j in range(n_local_clusters):
+            local_cluster_embeddings_ = global_cluster_embeddings_[
+                np.array([j in lc for lc in local_clusters])
+            ]
+            indices = np.where(
+                (embeddings == local_cluster_embeddings_[:, None]).all(-1)
+            )[1]
+            for idx in indices:
+                all_local_clusters[idx] = np.append(
+                    all_local_clusters[idx], j + total_clusters
+                )
+
+        total_clusters += n_local_clusters
+
+    return all_local_clusters
+
+
+def get_clusters(
+    nodes: List[BaseNode],
+    embedding_map: Dict[str, List[float]],  # node id -> embedding vector
+    max_length_in_cluster: int = 10000,  # 10k tokens max per cluster
+    tokenizer: tiktoken.Encoding = tiktoken.get_encoding("cl100k_base"),
+    reduction_dimension: int = 10,
+    threshold: float = 0.1,
+) -> List[List[BaseNode]]:
+    # get embeddings
+    embeddings = np.array([np.array(embedding_map[node.id_]) for node in nodes])
+
+    # Perform the clustering
+    clusters = perform_clustering(
+        embeddings, dim=reduction_dimension, threshold=threshold
+    )
+
+    # Initialize an empty list to store the clusters of nodes
+    node_clusters = []
+
+    # Iterate over each unique label in the clusters
+    for label in np.unique(np.concatenate(clusters)):
+        # Get the indices of the nodes that belong to this cluster
+        indices = [i for i, cluster in enumerate(clusters) if label in cluster]
+
+        # Add the corresponding nodes to the node_clusters list
+        cluster_nodes = [nodes[i] for i in indices]
+
+        # Base case: if the cluster only has one node, do not attempt to recluster it
+        if len(cluster_nodes) == 1:
+            node_clusters.append(cluster_nodes)
+            continue
+
+        # Calculate the total length of the text in the nodes
+        total_length = sum([len(tokenizer.encode(node.text)) for node in cluster_nodes])
+
+        # If the total length exceeds the maximum allowed length, recluster this cluster
+        if total_length > max_length_in_cluster:
+            node_clusters.extend(
+                get_clusters(
+                    cluster_nodes,
+                    embedding_map,
+                    max_length_in_cluster=max_length_in_cluster,
+                    tokenizer=tokenizer,
+                    reduction_dimension=reduction_dimension,
+                    threshold=threshold,
+                )
+            )
+        else:
+            node_clusters.append(cluster_nodes)
+
+    return node_clusters
diff --git a/llama-index-packs/llama-index-packs-raptor/pyproject.toml b/llama-index-packs/llama-index-packs-raptor/pyproject.toml
new file mode 100644
index 0000000000000..4e0202df446b8
--- /dev/null
+++ b/llama-index-packs/llama-index-packs-raptor/pyproject.toml
@@ -0,0 +1,60 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+# Feel free to un-skip examples, and experimental, you will just need to
+# work through many typos (--write-changes and
--interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = true +import_path = "llama_index.packs.raptor" + +[tool.llamahub.class_authors] +RaptorPack = "logan-markewich" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Logan Markewich "] +description = "llama-index packs raptor integration" +keywords = ["cluster", "raptor", "retrieval"] +license = "MIT" +name = "llama-index-packs-raptor" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.1" + +[tool.poetry.dependencies] +python = ">=3.9,<4.0" +llama-index-core = "^0.10.0" +llama-index-llms-openai = "^0.1.6" +umap-learn = ">=0.5.5" +scikit-learn = "*" + +[tool.poetry.group.dev.dependencies] +black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} +codespell = {extras = ["toml"], version = ">=v2.2.6"} +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-packs/llama-index-packs-raptor/tests/BUILD b/llama-index-packs/llama-index-packs-raptor/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-packs/llama-index-packs-raptor/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-packs/llama-index-packs-raptor/tests/__init__.py b/llama-index-packs/llama-index-packs-raptor/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-packs/llama-index-packs-raptor/tests/test_packs_raptor.py b/llama-index-packs/llama-index-packs-raptor/tests/test_packs_raptor.py new file mode 100644 index 0000000000000..a8deb2a261a69 --- /dev/null +++ b/llama-index-packs/llama-index-packs-raptor/tests/test_packs_raptor.py @@ -0,0 +1,30 @@ +from llama_index.core import Document, MockEmbedding +from llama_index.core.llms import MockLLM +from llama_index.packs.raptor.base import RaptorRetriever + + +def test_raptor() -> None: + retriever = RaptorRetriever( + [ + Document(text="one"), + Document(text="two"), + Document(text="three"), + Document(text="four"), + Document(text="five"), + Document(text="six"), + Document(text="seven"), + Document(text="eight"), + Document(text="nine"), + Document(text="ten"), + ], + embed_model=MockEmbedding(embed_dim=1536), + llm=MockLLM(), + ) + + assert len(retriever.index.docstore.docs) == 13 + + nodes = retriever.retrieve("test", mode="collapsed") + assert len(nodes) == 2 + + nodes = retriever.retrieve("text", mode="tree_traversal") + assert len(nodes) == 2
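+
+
+def test_raptor_persistence(tmp_path) -> None:
+    # Sketch of a persistence round-trip, assuming the default (local) simple
+    # stores persist and reload cleanly via persist()/from_persist_dir();
+    # `tmp_path` is the standard pytest fixture.
+    retriever = RaptorRetriever(
+        [
+            Document(text="one"),
+            Document(text="two"),
+            Document(text="three"),
+        ],
+        embed_model=MockEmbedding(embed_dim=1536),
+        llm=MockLLM(),
+    )
+
+    persist_dir = str(tmp_path / "raptor")
+    retriever.persist(persist_dir)
+
+    loaded = RaptorRetriever.from_persist_dir(
+        persist_dir, embed_model=MockEmbedding(embed_dim=1536)
+    )
+
+    # the reloaded index should contain the same nodes as the original
+    assert len(loaded.index.docstore.docs) == len(retriever.index.docstore.docs)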