diff --git a/README.md b/README.md index 2acd985..66a9da4 100644 --- a/README.md +++ b/README.md @@ -105,9 +105,9 @@ chunks = list(HierarchicalChunker().chunk(doc)) ## More examples Check out the [examples](examples) — showcasing different variants of RAG incl. vector ingestion & retrieval: -- [[LlamaIndex] Milvus dense-embedding RAG](examples/basic_pipeline.ipynb) -- [[LlamaIndex] Milvus hybrid RAG, combining dense & sparse embeddings](examples/hybrid_pipeline.ipynb) -- [[LlamaIndex] Milvus RAG, also fetching native document metadata for search results](examples/native_nodes.ipynb) +- [[LlamaIndex] Milvus basic RAG (dense embeddings)](examples/basic_pipeline.ipynb) +- [[LlamaIndex] Milvus hybrid RAG (dense & sparse embeddings combined e.g. via RRF) & reranker model usage](examples/hybrid_pipeline.ipynb) +- [[LlamaIndex] Milvus RAG also fetching native document metadata for search results](examples/native_nodes.ipynb) - [[LlamaIndex] Local node transformations (e.g. embeddings)](examples/node_transformations.ipynb) - ... diff --git a/examples/hybrid_pipeline.ipynb b/examples/hybrid_pipeline.ipynb index d7eae25..14f22f2 100644 --- a/examples/hybrid_pipeline.ipynb +++ b/examples/hybrid_pipeline.ipynb @@ -35,7 +35,8 @@ " llama-index-embeddings-huggingface \\\n", " llama-index-llms-huggingface-api \\\n", " flagembedding \\\n", - " llama-index-vector-stores-milvus" + " llama-index-vector-stores-milvus \\\n", + " llama-index-postprocessor-flag-embedding-reranker" ] }, { @@ -57,8 +58,8 @@ " \"https://arxiv.org/pdf/2206.01062\", # URL (DocLayNet paper)\n", "]\n", "TEXT_QA_TEMPLATE_STR = \"Context information is below.\\n---------------------\\n{context_str}\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: {query_str}\\nAnswer:\\n\"\n", - "QUERY = \"How many pages were human annotated?\"\n", - "TOP_K = 3\n", + "QUERY = \"What is this paper about?\"\n", + "TOP_K = 5\n", "\n", "INGEST = TypeAdapter(bool).validate_python(os.environ.get(\"INGEST\", \"True\"))" ] @@ -133,11 +134,20 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/pva/work/github.com/DS4SD/quackling/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", "\n", - "HF_EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"\n", + "HF_EMBED_MODEL_ID = \"BAAI/bge-m3\"\n", "\n", "embed_model = HuggingFaceEmbedding(model_name=HF_EMBED_MODEL_ID)" ] @@ -153,7 +163,18 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/pva/work/github.com/DS4SD/quackling/.venv/lib/python3.12/site-packages/pydantic/_internal/_fields.py:160: UserWarning: Field \"model_name\" has conflict with protected namespace \"model_\".\n", + "\n", + "You may be able to resolve this warning by setting `model_config['protected_namespaces'] = ()`.\n", + " warnings.warn(\n" + ] + } + ], "source": [ "from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI\n", "\n", @@ -177,50 +198,11 @@ "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [ - "from FlagEmbedding import BGEM3FlagModel\n", - "from llama_index.vector_stores.milvus.utils import BaseSparseEmbeddingFunction\n", - "\n", - "\n", - "class BGEM3SparseEmbedding(BaseSparseEmbeddingFunction):\n", - " def __init__(self):\n", - " self.model = BGEM3FlagModel(\"BAAI/bge-m3\", use_fp16=False)\n", - "\n", - " def encode_queries(self, queries: list[str]):\n", - " outputs = self.model.encode(\n", - " queries,\n", - " return_dense=False,\n", - " return_sparse=True,\n", - " return_colbert_vecs=False,\n", - " )[\"lexical_weights\"]\n", - " return [self._to_standard_dict(output) for output in outputs]\n", - "\n", - " def encode_documents(self, documents: list[str]):\n", - " outputs = self.model.encode(\n", - " documents,\n", - " return_dense=False,\n", - " return_sparse=True,\n", - " return_colbert_vecs=False,\n", - " )[\"lexical_weights\"]\n", - " return [self._to_standard_dict(output) for output in outputs]\n", - "\n", - " def _to_standard_dict(self, raw_output):\n", - " result = {}\n", - " for k in raw_output:\n", - " result[int(k)] = raw_output[k]\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6f675cd641c64c7db792b1e0bacd5d79", + "model_id": "e15e5c85c31c454a87c2436e957abc4d", "version_major": 2, "version_minor": 0 }, @@ -234,6 +216,7 @@ ], "source": [ "from llama_index.vector_stores.milvus import MilvusVectorStore\n", + "from llama_index.vector_stores.milvus.utils import BGEM3SparseEmbeddingFunction\n", "\n", "MILVUS_URL = os.environ[\"MILVUS_URL\"]\n", "MILVUS_COLL_NAME = os.environ.get(\"MILVUS_COLL_NAME\", \"quackling_hybrid_pipeline\")\n", @@ -243,7 +226,7 @@ " os.environ.get(\"MILVUS_HYBRID_RNKR_PARAMS\", '{\"k\": 60}')\n", ")\n", "\n", - "sparse_embedding = BGEM3SparseEmbedding()\n", + "sparse_embedding = BGEM3SparseEmbeddingFunction()\n", "\n", "vector_store = MilvusVectorStore(\n", " uri=MILVUS_URL,\n", @@ -260,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -269,6 +252,24 @@ "vector_store_query_mode = VectorStoreQueryMode.HYBRID" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reranker" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from 
llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker\n", + "\n", + "reranker = FlagEmbeddingReranker(model=\"BAAI/bge-reranker-v2-m3\", top_n=TOP_K)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -284,7 +285,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7b80ae5ee072444593b8393fb2e0cbab", + "model_id": "150032984d964ed0bf5df6e2ed0412f6", "version_major": 2, "version_minor": 0 }, @@ -306,7 +307,7 @@ "│ │ excluded_embed_metadata_keys=['dl_doc_hash'],\n", "│ │ excluded_llm_metadata_keys=['dl_doc_hash'],\n", "│ │ relationships={},\n", - "│ │ text='{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"title\":null,\"abstract\":null,\"authors\":null,\"affiliations\":null,\"subjects\":null,\"keywords\":null,\"publication_date\":null,\"languages\":null,\"license\":null,\"publishers\":null,\"url_refs\":null,\"references\":nu'+102165,\n", + "│ │ text='{\"_name\":\"\",\"type\":\"pdf-document\",\"description\":{\"title\":null,\"abstract\":null,\"authors\":null,\"affiliations\":null,\"subjects\":null,\"keywords\":null,\"publication_date\":null,\"languages\":null,\"license\":null,\"publishers\":null,\"url_refs\":null,\"references\":nu'+171180,\n", "│ │ mimetype='text/plain',\n", "│ │ start_char_idx=None,\n", "│ │ end_char_idx=None,\n", @@ -326,7 +327,7 @@ "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'dl_doc_hash'\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"title\":null,\"abstract\":null,\"authors\":null,\"affiliations\":null,\"subjects\":null,\"keywords\":null,\"publication_date\":null,\"languages\":null,\"license\":null,\"publishers\":null,\"url_refs\":null,\"references\":nu'\u001b[0m+\u001b[1;36m102165\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"_name\":\"\",\"type\":\"pdf-document\",\"description\":\u001b[0m\u001b[32m{\u001b[0m\u001b[32m\"title\":null,\"abstract\":null,\"authors\":null,\"affiliations\":null,\"subjects\":null,\"keywords\":null,\"publication_date\":null,\"languages\":null,\"license\":null,\"publishers\":null,\"url_refs\":null,\"references\":nu'\u001b[0m+\u001b[1;36m171180\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", @@ -369,6 +370,13 @@ "## RAG" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the reranker in RAG, we include it in the `node_postprocessors`:" + ] + }, { "cell_type": "code", "execution_count": 12, @@ -378,17 +386,17 @@ "data": { "text/html": [ "
Response(\n", - "│ response='80K pages were human annotated.',\n", + "│ response='This paper presents the DocLayNet dataset, which is a new and challenging dataset for the document conversion and layout analysis research community. The dataset was created by human annotation to obtain reliable layout ground-truth on a wide variety'+313,\n", "│ source_nodes=[\n", "│ │ NodeWithScore(\n", "│ │ │ node=TextNode(\n", - "│ │ │ │ id_='756357a5-6b05-4e0d-84b8-25eac5f453f2',\n", + "│ │ │ │ id_='41f13be2-80d4-4480-a61b-4f9ce6d850cc',\n", "│ │ │ │ embedding=None,\n", "│ │ │ │ metadata={...},\n", "│ │ │ │ excluded_embed_metadata_keys=[...],\n", "│ │ │ │ excluded_llm_metadata_keys=[...],\n", "│ │ │ │ relationships={...},\n", - "│ │ │ │ text='3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'+303,\n", + "│ │ │ │ text='6 CONCLUSION\\nIn this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets,'+270,\n", "│ │ │ │ mimetype='text/plain',\n", "│ │ │ │ start_char_idx=None,\n", "│ │ │ │ end_char_idx=None,\n", @@ -396,27 +404,27 @@ "│ │ │ │ metadata_template='{key}: {value}',\n", "│ │ │ │ metadata_seperator='\\n'\n", "│ │ │ ),\n", - "│ │ │ score=0.032522473484277725\n", + "│ │ │ score=-1.5540306568145752\n", "│ │ ),\n", - "│ │ ... +2\n", + "│ │ ... +4\n", "│ ],\n", - "│ metadata={'756357a5-6b05-4e0d-84b8-25eac5f453f2': {'path': '$.main-text[36]', ... +1}, ... +2}\n", + "│ metadata={'41f13be2-80d4-4480-a61b-4f9ce6d850cc': {'path': '$.main-text[115]'}, ... +4}\n", ")\n", "\n" ], "text/plain": [ "\u001b[1;35mResponse\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'80K pages were human annotated.'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresponse\u001b[0m=\u001b[32m'This paper presents the DocLayNet dataset, which is a new and challenging dataset for the document conversion and layout analysis research community. 
The dataset was created by human annotation to obtain reliable layout ground-truth on a wide variety'\u001b[0m+\u001b[1;36m313\u001b[0m,\n", "\u001b[2;32m│ \u001b[0m\u001b[33msource_nodes\u001b[0m=\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'756357a5-6b05-4e0d-84b8-25eac5f453f2'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'41f13be2-80d4-4480-a61b-4f9ce6d850cc'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[33m...\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[33m...\u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'\u001b[0m+\u001b[1;36m303\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'6 CONCLUSION\\nIn this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. 
In contrast to many other datasets,'\u001b[0m+\u001b[1;36m270\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", @@ -424,11 +432,11 @@ "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.032522473484277725\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m-1.5540306568145752\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m2\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m4\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'756357a5-6b05-4e0d-84b8-25eac5f453f2'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m2\u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'41f13be2-80d4-4480-a61b-4f9ce6d850cc'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m115\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[1m}\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m4\u001b[0m\u001b[1m}\u001b[0m\n", "\u001b[1m)\u001b[0m\n" ] }, @@ -442,6 +450,7 @@ "query_engine = index.as_query_engine(\n", " llm=llm,\n", " similarity_top_k=TOP_K,\n", + " node_postprocessors=[reranker], # <==\n", " text_qa_template=PromptTemplate(TEXT_QA_TEMPLATE_STR),\n", " vector_store_query_mode=vector_store_query_mode,\n", ")\n", @@ -467,13 +476,13 @@ "
[\n", "│ NodeWithScore(\n", "│ │ node=TextNode(\n", - "│ │ │ id_='756357a5-6b05-4e0d-84b8-25eac5f453f2',\n", + "│ │ │ id_='a2b5a711-7159-460a-8460-ddd080b4db77',\n", "│ │ │ embedding=None,\n", - "│ │ │ metadata={'path': '$.main-text[36]', ... +1},\n", + "│ │ │ metadata={'path': '$.main-text[88]'},\n", "│ │ │ excluded_embed_metadata_keys=['path'],\n", "│ │ │ excluded_llm_metadata_keys=['path'],\n", "│ │ │ relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(...), ... +2},\n", - "│ │ │ text='3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. The annotations provide layout information in the shape of'+303,\n", + "│ │ │ text='5 EXPERIMENTS\\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.',\n", "│ │ │ mimetype='text/plain',\n", "│ │ │ start_char_idx=None,\n", "│ │ │ end_char_idx=None,\n", @@ -481,9 +490,9 @@ "│ │ │ metadata_template='{key}: {value}',\n", "│ │ │ metadata_seperator='\\n'\n", "│ │ ),\n", - "│ │ score=0.032522473484277725\n", + "│ │ score=0.032786883413791656\n", "│ ),\n", - "│ ... +2\n", + "│ ... +4\n", "]\n", "\n" ], @@ -491,13 +500,13 @@ "\u001b[1m[\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1;35mNodeWithScore\u001b[0m\u001b[1m(\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[33mnode\u001b[0m=\u001b[1;35mTextNode\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'756357a5-6b05-4e0d-84b8-25eac5f453f2'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mid_\u001b[0m=\u001b[32m'a2b5a711-7159-460a-8460-ddd080b4db77'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33membedding\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m1\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m88\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m\u001b[1m}\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mexcluded_embed_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'path'\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mexcluded_llm_metadata_keys\u001b[0m=\u001b[1m[\u001b[0m\u001b[32m'path'\u001b[0m\u001b[1m]\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mrelationships\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mNodeRelationship.SOURCE:\u001b[0m\u001b[39m \u001b[0m\u001b[32m'1'\u001b[0m\u001b[1m>\u001b[0m: \u001b[1;35mRelatedNodeInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[33m...\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m...\u001b[0m +\u001b[1;36m2\u001b[0m\u001b[1m}\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'3 THE DOCLAYNET DATASET\\nDocLayNet contains 80863 PDF pages. Among these, 7059 carry two instances of human annotations, and 1591 carry three. This amounts to 91104 total annotation instances. 
The annotations provide layout information in the shape of'\u001b[0m+\u001b[1;36m303\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'5 EXPERIMENTS\\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mmimetype\u001b[0m=\u001b[32m'text/plain'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstart_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mend_char_idx\u001b[0m=\u001b[3;35mNone\u001b[0m,\n", @@ -505,9 +514,9 @@ "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mmetadata_template\u001b[0m=\u001b[32m'\u001b[0m\u001b[32m{\u001b[0m\u001b[32mkey\u001b[0m\u001b[32m}\u001b[0m\u001b[32m: \u001b[0m\u001b[32m{\u001b[0m\u001b[32mvalue\u001b[0m\u001b[32m}\u001b[0m\u001b[32m'\u001b[0m,\n", "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mmetadata_seperator\u001b[0m=\u001b[32m'\\n'\u001b[0m\n", "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.032522473484277725\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mscore\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.032786883413791656\u001b[0m\n", "\u001b[2;32m│ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m2\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33m...\u001b[0m +\u001b[1;36m4\u001b[0m\n", "\u001b[1m]\u001b[0m\n" ] }, @@ -524,6 +533,179 @@ "pprint(retr_res, max_length=1, max_string=250, max_depth=4)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are the retrieval results (basic fields) prior to reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n", + "│ {\n", + "│ │ 'path': '$.main-text[88]',\n", + "│ │ 'text': '5 EXPERIMENTS\\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.',\n", + "│ │ 'score': 0.032786883413791656\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[115]',\n", + "│ │ 'text': '6 CONCLUSION\\nIn this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets,'+270,\n", + "│ │ 'score': 0.0320020467042923\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[74]',\n", + "│ │ 'text': '4 ANNOTATION CAMPAIGN\\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.',\n", + "│ │ 'score': 0.016129031777381897\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[10]',\n", + "│ │ 'text': 'CCS CONCEPTS\\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this noti'+160,\n", + "│ │ 'score': 0.01587301678955555\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[11]',\n", + "│ │ 'text': \"CCS CONCEPTS\\nKDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043\",\n", + "│ │ 'score': 0.015625\n", + "│ }\n", + "]\n", + "\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m88\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'5 EXPERIMENTS\\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.032786883413791656\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m115\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'6 CONCLUSION\\nIn this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets,'\u001b[0m+\u001b[1;36m270\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0320020467042923\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m74\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'4 ANNOTATION CAMPAIGN\\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. 
Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.016129031777381897\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'CCS CONCEPTS\\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this noti'\u001b[0m+\u001b[1;36m160\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.01587301678955555\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m\"CCS CONCEPTS\\nKDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author\u001b[0m\u001b[32m(\u001b[0m\u001b[32ms\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043\"\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.015625\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "before_rrk = [\n", + " dict(path=node.metadata[\"path\"], text=node.text, score=node.score)\n", + " for node in retr_res\n", + "]\n", + "pprint(before_rrk, max_string=250)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we apply the reranker on the retrieved results — as you can see, the reranker has indeed changed the order:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n", + "│ {\n", + "│ │ 'path': '$.main-text[115]',\n", + "│ │ 'text': '6 CONCLUSION\\nIn this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets,'+270,\n", + "│ │ 'score': -1.5540306568145752\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[74]',\n", + "│ │ 'text': '4 ANNOTATION CAMPAIGN\\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.',\n", + "│ │ 'score': -4.99463415145874\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[88]',\n", + "│ │ 'text': '5 EXPERIMENTS\\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.',\n", + "│ │ 'score': -6.130667686462402\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[10]',\n", + "│ │ 'text': 'CCS CONCEPTS\\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this noti'+160,\n", + "│ │ 'score': -6.377060890197754\n", + "│ },\n", + "│ {\n", + "│ │ 'path': '$.main-text[11]',\n", + "│ │ 'text': \"CCS CONCEPTS\\nKDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043\",\n", + "│ │ 'score': -7.342046737670898\n", + "│ }\n", + "]\n", + "\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m115\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'6 CONCLUSION\\nIn this paper, we presented the DocLayNet dataset. It provides the document conversion and layout analysis research community a new and challenging dataset to improve and fine-tune novel ML methods on. In contrast to many other datasets,'\u001b[0m+\u001b[1;36m270\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m-1.5540306568145752\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m74\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'4 ANNOTATION CAMPAIGN\\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. 
Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m-4.99463415145874\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m88\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'5 EXPERIMENTS\\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m-6.130667686462402\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m10\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m'CCS CONCEPTS\\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this noti'\u001b[0m+\u001b[1;36m160\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m-6.377060890197754\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'path'\u001b[0m: \u001b[32m'$.main-text\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'text'\u001b[0m: \u001b[32m\"CCS CONCEPTS\\nKDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author\u001b[0m\u001b[32m(\u001b[0m\u001b[32ms\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. ACM ISBN 978-1-4503-9385-0/22/08. 
https://doi.org/10.1145/3534678.3539043\"\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m-7.342046737670898\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "reranked = reranker.postprocess_nodes(nodes=retr_res, query_str=QUERY)\n", + "after_rrk = [\n", + " dict(path=node.metadata[\"path\"], text=node.text, score=node.score)\n", + " for node in reranked\n", + "]\n", + "pprint(after_rrk, max_string=250)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/poetry.lock b/poetry.lock index 78ea2d3..ac9fa4e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2596,6 +2596,20 @@ files = [ huggingface-hub = ">=0.23.0,<0.24.0" llama-index-core = ">=0.11.0,<0.12.0" +[[package]] +name = "llama-index-postprocessor-flag-embedding-reranker" +version = "0.2.0" +description = "llama-index postprocessor flag embedding reranker integration" +optional = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_postprocessor_flag_embedding_reranker-0.2.0-py3-none-any.whl", hash = "sha256:66f8d9f3b9879ef7ae85e46f83c42816bdd5240af51e346ab0f628a44c801a9b"}, + {file = "llama_index_postprocessor_flag_embedding_reranker-0.2.0.tar.gz", hash = "sha256:83fc160983dff61f877558ff3b1e5f846c43a638e395955898df46c806f87a3e"}, +] + +[package.dependencies] +llama-index-core = ">=0.11.0,<0.12.0" + [[package]] name = "llama-index-vector-stores-milvus" version = "0.2.1" @@ -7205,9 +7219,9 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -examples = ["flagembedding", "jsonpath-ng", "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", "llama-index-vector-stores-milvus", "python-dotenv"] +examples = ["flagembedding", "jsonpath-ng", "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", "llama-index-postprocessor-flag-embedding-reranker", "llama-index-vector-stores-milvus", "python-dotenv"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "9c49083ae59e521cfbcc1469eb40758585cf3c0a245b5e21ab67329c08aeab16" +content-hash = "216cbafe74fd88f7f4e6fede29223648e467cb98a142104f8b9b50c21288a61a" diff --git a/pyproject.toml b/pyproject.toml index a51650f..482219a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ python-dotenv = { version = "^1.0.1", optional = true } llama-index-embeddings-huggingface = { version = "^0.3.1", optional = true } llama-index-llms-huggingface-api = { version = "^0.2.0", optional = true } llama-index-vector-stores-milvus = { version = "^0.2.1", optional = true } +llama-index-postprocessor-flag-embedding-reranker = {version = "^0.2.0", optional = true } flagembedding = { version = "^1.2.10", optional = true } jsonpath-ng = { version = "^1.6.1", optional = true } @@ -54,6 +55,7 @@ examples = [ "llama-index-embeddings-huggingface", "llama-index-llms-huggingface-api", "llama-index-vector-stores-milvus", + "llama-index-postprocessor-flag-embedding-reranker", "flagembedding", "jsonpath-ng", ]
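A few notes on the changes above, with minimal sketches of how the new pieces fit together.

First, the hand-rolled `BGEM3SparseEmbedding` wrapper around `FlagEmbedding.BGEM3FlagModel` is dropped in favor of the `BGEM3SparseEmbeddingFunction` that ships with `llama-index-vector-stores-milvus`, so the notebook no longer implements `encode_queries`/`encode_documents` itself. A sketch of the resulting hybrid store setup, assuming a reachable Milvus instance (the URI and collection name are placeholders — the notebook reads them from the environment — and the keyword arguments follow the 0.2.x Milvus integration pinned in the lockfile):

from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.vector_stores.milvus.utils import BGEM3SparseEmbeddingFunction

# Dense side: the diff switches HF_EMBED_MODEL_ID to BGE-M3, so the dense and
# the sparse vectors now both come from the same model.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3")

vector_store = MilvusVectorStore(
    uri="http://localhost:19530",  # placeholder; the notebook uses MILVUS_URL
    collection_name="quackling_hybrid_pipeline",
    dim=len(embed_model.get_text_embedding("hi")),  # dense dimension (1024 for bge-m3)
    enable_sparse=True,
    sparse_embedding_function=BGEM3SparseEmbeddingFunction(),  # BGE-M3 lexical weights
    hybrid_ranker="RRFRanker",  # fuse the dense and sparse result lists
    hybrid_ranker_params={"k": 60},  # matches the MILVUS_HYBRID_RNKR_PARAMS default
)

# Hybrid search is then requested per query:
vector_store_query_mode = VectorStoreQueryMode.HYBRID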
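Second, the reranker is wired in as a LlamaIndex node postprocessor. Condensing the new notebook cells — `index`, `llm`, `QUERY`, and `TEXT_QA_TEMPLATE_STR` are the objects built earlier in the notebook:

from llama_index.core import PromptTemplate
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

TOP_K = 5  # retrieve 5 fused candidates; the reranker re-scores them and keeps top_n

reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-v2-m3", top_n=TOP_K)

query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=TOP_K,
    node_postprocessors=[reranker],  # cross-encoder re-scoring before the context reaches the LLM
    text_qa_template=PromptTemplate(TEXT_QA_TEMPLATE_STR),
    vector_store_query_mode=vector_store_query_mode,
)
query_res = query_engine.query(QUERY)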
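Third, a word on the two score columns in the before/after listings. The pre-rerank values are RRF fusion scores, consistent with sums of 1/(k + rank) over the dense and sparse hit lists with k=60 — e.g. the top value 0.03279 ≈ 2/61 is what a chunk ranked first in both lists would receive — so they all sit in a narrow band below 2/61 ≈ 0.0328. The post-rerank values are raw bge-reranker-v2-m3 cross-encoder logits: unbounded, typically negative here, and not comparable to the RRF scores; only their relative order matters. Standalone application of the reranker, as in the last new cell, boils down to:

# retr_res is the list of NodeWithScore objects returned by the retriever above;
# at this point each .score is an RRF fusion value.
reranked = reranker.postprocess_nodes(nodes=retr_res, query_str=QUERY)

for node in reranked:
    # .score is now a cross-encoder logit; negative values are normal, and only
    # the ordering (not the magnitude) is meaningful.
    print(f"{node.score:+8.3f}  {node.metadata['path']}")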
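Finally, on the dependency side: `llama-index-postprocessor-flag-embedding-reranker` is added as an optional dependency inside the existing `examples` extra, so it is installed the same way as the other example dependencies — e.g. `poetry install --extras examples`, or `pip install "quackling[examples]"` for a pip-based setup (package name assumed from the repository). The lockfile's `content-hash` update is the expected by-product of the `pyproject.toml` change.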