diff --git a/preprocessing_demo/preprocessing_demo.ipynb b/preprocessing_demo/preprocessing_demo.ipynb new file mode 100644 index 00000000..816f8fd0 --- /dev/null +++ b/preprocessing_demo/preprocessing_demo.ipynb @@ -0,0 +1,1270 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "192e7b22-47c8-41f5-ae1b-f63b0ac5131d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b7646cdb-fadc-4e57-ad96-05532a211bb5", + "metadata": {}, + "outputs": [], + "source": [ + "# openai work\n", + "from openai import OpenAI\n", + "\n", + "client = OpenAI()\n", + "def stream_openai(client, role, message, model=\"gpt-4o-mini\"):\n", + " stream = client.chat.completions.create(\n", + " model=model,\n", + " messages=[{\"role\": role, \"content\": message}],\n", + " stream=True,\n", + " )\n", + " message = \"\"\n", + " for chunk in stream:\n", + " if chunk.choices[0].delta.content is not None:\n", + " message += chunk.choices[0].delta.content\n", + " return message" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "afb94b80", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrew/micromamba/envs/ragna-dev/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "# Preprocessor test\n", + "from pathlib import Path\n", + "\n", + "from ragna.source_storages import RagnaDemoSourceStorage\n", + "from ragna.source_storages import LanceDB\n", + "from ragna.core import (\n", + " LocalDocument, Document\n", + ")\n", + "\n", + "\n", + "storage = LanceDB()\n", + "\n", + "temporal_doc_paths = [p for p in Path.cwd().joinpath(\"temporal_docs\").iterdir() if p.suffix == \".md\"]\n", + "temporal_docs = [\n", + " LocalDocument.from_path(path) for path in temporal_doc_paths\n", + "]\n", + "storage.store(\"temporal_docs\", temporal_docs)" + ] + }, + { + "cell_type": "markdown", + "id": "97926ef8", + "metadata": {}, + "source": [ + "# Preprocessor that rewords the question with context\n", + "\n", + "## Without any context preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "861713ce-bb5f-484b-b7e3-e96b9b8775ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last year, a rocket was built.\n" + ] + } + ], + "source": [ + "from typing import Optional\n", + "\n", + "from ragna.assistants import RagnaDemoAssistant\n", + "\n", + "from ragna import Rag\n", + "from ragna.assistants import Gpt4\n", + "\n", + "\n", + "chat = Rag().chat(\n", + " input=None,\n", + " source_storage = storage,\n", + " #assistant=RagnaDemoAssistant,\n", + " assistant=Gpt4,\n", + " corpus_name=\"temporal_docs\",\n", + ")\n", + "_ = await chat.prepare()\n", + "print(await chat.answer(\"What happened last year?\"))" + ] + }, + { + "cell_type": "markdown", + "id": "b85129b4-e087-4de7-9d30-a23f0f849f25", + "metadata": {}, + "source": [ + "# With context preprocessing\n", + "\n", + "To build on this, you could add more context like location" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d6aebb98-a054-4528-967a-d46dbd96bcfa", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "from 
datetime import datetime\n", + "from ragna.core import (\n", + "QueryPreprocessor, MetadataFilter, ProcessedQuery\n", + ")\n", + "\n", + "base_prompt = (\n", + "\"\"\"You are a llm agent that is responsible for rewriting an input prompt for a RAG application. \n", + "A question might contain hidden context that may not be recognized by the embedding model. \n", + "An embedding of a question might not generate a close match to an embedding of a statement\n", + "that contains the answer to the question. Although this is technically something that should\n", + "be solved on the embedding model side, it is usually solved by rephrasing the question\n", + "before using it to retrieve sources. For example a question like \"What happened last month?\" \n", + "likely won't get any close matches. Rephrasing the prompt to \"What happened in December 2021?\"\n", + "given that the question is asked January 2022. Things that may be important to consider when \n", + "reworking the prompt would be the current context, that may that is not explicitly asked in the \n", + "question but can be inferred, for instance the current date. \n", + "\n", + "current date: {}\n", + "\n", + "\n", + "Please reword the following prompt \n", + "and only return the reworded prompt. I do not need to know your reasoning:\n", + "\n", + "\n", + "\"\"\".format(datetime.today()))\n", + "\n", + "class TestPreprocessor(QueryPreprocessor):\n", + "\n", + " def __init__(self):\n", + " # self.storage=storage\n", + " # self.assistant=assistant\n", + " self.messages = []\n", + "\n", + " def ask_assistant(self, prompt):\n", + " instruction = (base_prompt + prompt)\n", + " print(instruction)\n", + " assistant_answer = stream_openai(client, \"user\", instruction)\n", + " return assistant_answer\n", + "\n", + " def process(self, query: str, metadata_filter: Optional[MetadataFilter]):\n", + " processed_query = self.ask_assistant(query)\n", + " return ProcessedQuery(\n", + " original_query=query,\n", + " processed_query=processed_query,\n", + " metadata_filter=None,\n", + " processor_name=self.display_name()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bf4879d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You are a llm agent that is responsible for rewriting an input prompt for a RAG application. \n", + "A question might contain hidden context that may not be recognized by the embedding model. \n", + "An embedding of a question might not generate a close match to an embedding of a statement\n", + "that contains the answer to the question. Although this is technically something that should\n", + "be solved on the embedding model side, it is usually solved by rephrasing the question\n", + "before using it to retrieve sources. For example a question like \"What happened last month?\" \n", + "likely won't get any close matches. Rephrasing the prompt to \"What happened in December 2021?\"\n", + "given that the question is asked January 2022. Things that may be important to consider when \n", + "reworking the prompt would be the current context, that may that is not explicitly asked in the \n", + "question but can be inferred, for instance the current date. \n", + "\n", + "current date: 2025-01-29 23:51:47.609820\n", + "\n", + "\n", + "Please reword the following prompt \n", + "and only return the reworded prompt. 
I do not need to know your reasoning:\n", + "\n", + "\n", + "What happened last year?\n" + ] + } + ], + "source": [ + "preprocessor = TestPreprocessor()\n", + "processed_query = preprocessor.process(\"What happened last year?\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cfbbf236", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What significant events occurred in 2024?\n" + ] + } + ], + "source": [ + "print(processed_query.processed_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a4c92e7c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In 2024, the significant event that occurred was a trip to the moon.\n" + ] + } + ], + "source": [ + "chat = Rag().chat(\n", + "    input=None,\n", + "    source_storage = storage,\n", + "    #assistant=RagnaDemoAssistant,\n", + "    assistant=Gpt4,\n", + "    corpus_name=\"temporal_docs\",\n", + ")\n", + "_ = await chat.prepare()\n", + "print(await chat.answer(processed_query.processed_query))" + ] + }, + { + "cell_type": "markdown", + "id": "685baa0f", + "metadata": {}, + "source": [ + "## Findings\n", + "\n", + "This preprocessor can be effective for questions where context is important. In an \"agentic\" workflow you could have a step that determines and looks up the relevant context, instead of always supplying the same context features. For example, in the prompt above you could have a step that decides whether the date is actually relevant to the question and only adds it to the prompt if it is.\n", + "\n", + "Note:\n", + "My prompt above may be too leading. Giving it a date example and then asking it a date-relevant question may not translate to a question where something like location is important.\n",
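+ "\n", + "For the \"agentic\" idea above, here is a minimal sketch of a date-relevance check. It reuses the `stream_openai` helper defined at the top of this notebook; the `is_date_relevant` name is hypothetical and not part of Ragna.\n", + "\n", + "```python\n", + "def is_date_relevant(client, question: str) -> bool:\n", + "    # Ask the model whether knowing today's date changes the answer.\n", + "    verdict = stream_openai(\n", + "        client,\n", + "        \"user\",\n", + "        \"Answer only 'yes' or 'no': does answering the question below \"\n", + "        f\"depend on knowing today's date?\\n\\nquestion: {question}\",\n", + "    )\n", + "    return verdict.strip().lower().startswith(\"yes\")\n", + "```"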
+ ] + }, + { + "cell_type": "markdown", + "id": "22c0920d", + "metadata": {}, + "source": [ + "# Query Expansion" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "84da6383", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[PosixPath('/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/preprocessing_demo/../docs/install.md'), PosixPath('/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/preprocessing_demo/../docs/index.md')]\n" + ] + } + ], + "source": [ + "# Ragna docs\n", + "from ragna.source_storages import LanceDB\n", + "\n", + "docs_path = Path.cwd().joinpath(\"../docs\")\n", + "\n", + "md_files = list(docs_path.glob(\"**/*.md\"))\n", + "print(md_files[:2])\n", + "\n", + "storage = LanceDB()\n", + "\n", + "documents = [\n", + "    (\n", + "        document\n", + "        if isinstance(document, Document)\n", + "        else LocalDocument.from_path(document)\n", + "    )\n", + "    for document in md_files\n", + "    ]\n", + "storage.store(\"ragna_docs\", documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "65b23e6a", + "metadata": {}, + "outputs": [], + "source": [ + "#example_question = \"how do I setup locally to contribute?\"\n", + "example_question = \"how do I build a rag application using ragna?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "be46c324", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I'm sorry, but the information provided does not contain any details on how to build a RAG application using Ragna.\n" + ] + } + ], + "source": [ + "chat = Rag().chat(\n", + "    input=None,\n", + "    source_storage = storage,\n", + "    #assistant=RagnaDemoAssistant,\n", + "    assistant=Gpt4,\n", + "    corpus_name=\"ragna_docs\",\n", + ")\n", + "_ = await chat.prepare()\n", + "print(await chat.answer(example_question))" + ] + }, + { + "cell_type": "markdown", + "id": "145670b6", + "metadata": {}, + "source": [ + "## Preprocessor\n", + "\n", + "This preprocessor first asks the LLM for a naive answer without any source material. It then uses that naive answer to retrieve sources, which it prepends to the original prompt. " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ade1b33b", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "from ragna.core import (\n", + "QueryPreprocessor, MetadataFilter, ProcessedQuery\n", + ")\n", + "\n", + "base_prompt = (\n", + "\"\"\"You are a helpful software engineer. 
\n", + " how would you answer the following question from a more junior developer?\n", + "\n", + " question: {}\n", + "\"\"\")\n", + "\n", + "class QueryExpansionPreprocessor(QueryPreprocessor):\n", + "\n", + " def __init__(self, storage):\n", + " self.storage=storage\n", + " # self.assistant=assistant\n", + " self.messages = []\n", + "\n", + " def ask_assistant(self, prompt):\n", + " instruction = (base_prompt.format(prompt))\n", + " assistant_answer = stream_openai(client, \"user\", instruction)\n", + " return assistant_answer\n", + "\n", + " def process(self, query: str, metadata_filter: Optional[MetadataFilter]):\n", + " hypothetical_answer = self.ask_assistant(query)\n", + " print(hypothetical_answer)\n", + " sources = self.storage.retrieve(\"ragna_docs\", metadata_filter, hypothetical_answer)\n", + " \n", + " processed_query = \"\\n\".join(s.content for s in sources) + \"\\n\\n\" + query\n", + " return ProcessedQuery(\n", + " original_query=query,\n", + " processed_query=processed_query,\n", + " metadata_filter=None,\n", + " processor_name=self.display_name()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "00433eae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To help a junior developer build a RAG (Retrieval-Augmented Generation) application using Ragna, I would first clarify that Ragna is likely a framework or library that supports the creation of RAG applications. Assuming Ragna is a fictional or hypothetical tool that we’re discussing as part of this learning exercise, here's how I would approach answering their question:\n", + "\n", + "---\n", + "\n", + "### Building a RAG Application Using Ragna\n", + "\n", + "**1. Understand RAG Concept:**\n", + "Before diving into the implementation, it’s important to understand what a RAG application is. A RAG combines retrieval and generation components to enhance responses or outputs. Typically, this involves pulling in relevant information from a knowledge base (retrieval) and generating human-like textual responses based on that information (generation).\n", + "\n", + "**2. Setting Up Your Environment:**\n", + "Make sure you have the following prerequisites installed:\n", + "- Python (version that is compatible with Ragna)\n", + "- Necessary dependencies (install via pip, e.g., `pip install ragna`)\n", + "\n", + "**3. Basic Structure of a RAG Application:**\n", + "Here’s a high-level process to create your application:\n", + "\n", + "**Step 1: Import Required Libraries**\n", + "Ensure that you import Ragna in your script.\n", + "```python\n", + "import ragna\n", + "```\n", + "\n", + "**Step 2: Configure Your Database/Knowledge Source**\n", + "You would need to set up the retrieval part. 
This could be a database, API, or a local set of documents.\n", + "\n", + "Example:\n", + "```python\n", + "knowledge_base = ragna.KnowledgeBase(source='path/to/your/documents')\n", + "```\n", + "\n", + "**Step 3: Create Your Retrieval Model**\n", + "Use Ragna's retrieval utilities to set up a model that fetches information based on user queries.\n", + "\n", + "```python\n", + "retriever = ragna.Retriever(knowledge_base=knowledge_base)\n", + "```\n", + "\n", + "**Step 4: Create Your Generation Model**\n", + "Depending on what kind of responses you want, you can configure a generative model that uses a transformer or another NLP model.\n", + "\n", + "```python\n", + "generator = ragna.Generator(model='gpt-3.5-turbo') # or any supported model\n", + "```\n", + "\n", + "**Step 5: Combine Retrieval and Generation**\n", + "To create a RAG flow, you fetch relevant documents and use them as context for generating responses.\n", + "\n", + "```python\n", + "def rag_response(query):\n", + " # Retrieve relevant information\n", + " retrieved_docs = retriever.retrieve(query)\n", + " \n", + " # Generate response based on retrieved info\n", + " response = generator.generate(query, context=retrieved_docs)\n", + " return response\n", + "```\n", + "\n", + "**Step 6: Test Your Application**\n", + "You can now define a simple loop to interact with your application.\n", + "```python\n", + "if __name__ == \"__main__\":\n", + " user_input = input(\"Ask your question: \")\n", + " while user_input.lower() != 'exit':\n", + " answer = rag_response(user_input)\n", + " print(\"Response:\", answer)\n", + " user_input = input(\"Ask your question: \")\n", + "```\n", + "\n", + "### Additional Tips:\n", + "- **Documentation:** Always refer to the [Ragna documentation](#) for detailed and updated usage patterns.\n", + "- **Fine-tuning:** You might want to fine-tune the generative model on specific data to get better responses.\n", + "- **Performance:** Consider optimizing retrieval speed if you deal with a large knowledge base.\n", + "- **Error Handling:** Make sure to include error handling for scenarios where the retrieval might not return results or the generation fails.\n", + "\n", + "This is a simplified overview, but it should give you a good start on building a RAG application using Ragna. If you have any questions or need clarification on specific points, feel free to ask!\n" + ] + } + ], + "source": [ + "preprocessor = QueryExpansionPreprocessor(storage)\n", + "\n", + "preprocessed_query = preprocessor.process(example_question, None)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e28532e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Frequently asked questions\n", + "\n", + "## Why should I use Ragna and not X?\n", + "\n", + "!!! tip \"TL;DR\"\n", + "\n", + " Ragna is the only tool out there that specializes in orchestrating RAG use cases\n", + " with arbitrary components, as well as offering a Python API, a REST API, and a web\n", + " UI for that.\n", + "\n", + "!!! note\n", + "\n", + " Although we try to be objective as possible, this section is inheritly biased. If\n", + " you are the author of a package we don't mention below but think we should or your\n", + " package is mentiond but you feel we have mischaracterized it, please\n", + " [get in touch](https://github.com/Quansight/ragna/discussions).\n", + "\n", + "After the emergence of ChatGPT in November of 2022, the field of LLMs exploded. 
Today,\n", + "there are many providers for LLM REST APIs out there. With these, we also have a\n", + "plethora of Python packages to build applications around the provided APIs. We cannot\n", + "summarize the whole field here, so we stick to large projects in the Python ecosystem\n", + "for comparison.\n", + "\n", + "| library or application | RAG | arbitrary components | Python API | REST API | web UI |\n", + "| ----------------------------------------------------- | :----------------: | :------------------: | :----------------: | :----------------: | :----------------: |\n", + "| Ragna | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |\n", + "| [LangChain](https://www.langchain.com/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\n", + "| [Langroid](https://langroid.github.io/langroid/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\n", + "| [LlamaIndex](https://www.llamaindex.ai/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\n", + "| [Ollama](https://ollama.com/) | :x: | :x: | :heavy_check_mark: | :heavy_check_mark: |\n", + "# Frequently asked questions\n", + "\n", + "## Why should I use Ragna and not X?\n", + "\n", + "!!! tip \"TL;DR\"\n", + "\n", + " Ragna is the only tool out there that specializes in orchestrating RAG use cases\n", + " with arbitrary components, as well as offering a Python API, a REST API, and a web\n", + " UI for that.\n", + "\n", + "!!! note\n", + "\n", + " Although we try to be objective as possible, this section is inheritly biased. If\n", + " you are the author of a package we don't mention below but think we should or your\n", + " package is mentiond but you feel we have mischaracterized it, please\n", + " [get in touch](https://github.com/Quansight/ragna/discussions).\n", + "\n", + "After the emergence of ChatGPT in November of 2022, the field of LLMs exploded. Today,\n", + "there are many providers for LLM REST APIs out there. With these, we also have a\n", + "plethora of Python packages to build applications around the provided APIs. 
We cannot\n", + "summarize the whole field here, so we stick to large projects in the Python ecosystem\n", + "for comparison.\n", + "\n", + "| library or application | RAG | arbitrary components | Python API | REST API | web UI |\n", + "| ----------------------------------------------------- | :----------------: | :------------------: | :----------------: | :----------------: | :----------------: |\n", + "| Ragna | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |\n", + "| [LangChain](https://www.langchain.com/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\n", + "| [Langroid](https://langroid.github.io/langroid/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\n", + "| [LlamaIndex](https://www.llamaindex.ai/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\n", + "| [Ollama](https://ollama.com/) | :x: | :x: | :heavy_check_mark: | :heavy_check_mark: |\n", + "\n", + "how do I build a rag application using ragna?\n" + ] + } + ], + "source": [ + "print(preprocessed_query.processed_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "4a4703bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I'm sorry, but the provided sources do not contain information on how to build a RAG application using Ragna.\n" + ] + } + ], + "source": [ + "chat = Rag().chat(\n", + " input=None,\n", + " source_storage = storage,\n", + " #assistant=RagnaDemoAssistant,\n", + " assistant=Gpt4,\n", + " corpus_name=\"ragna_docs\",\n", + ")\n", + "_ = await chat.prepare()\n", + "print(await chat.answer(preprocessed_query.processed_query))" + ] + }, + { + "cell_type": "markdown", + "id": "66f0bd72", + "metadata": {}, + "source": [ + "## Findings:\n", + "\n", + "For this to be valuable I think you would need to be asking a question that is related but perhaps indirectly to the source material. It also may be unreliable. When I ran this code on 1/24/25, a got a response attempting the question, and I continued to get a response across tries. When I tried again 1/27/25 I got \"I'm sorry, but the provided sources do not contain information on how to build a RAG application.\" To the best of my knowledge there were no changes and the processed query was the same in both instances.\n", + "\n", + "response from 1/24/25:\n", + "\n", + "```\n", + "To build a RAG (Retrieval-Augmented Generation) application using Ragna, you would need to follow these steps:\n", + "\n", + "1. Install Ragna: You can install Ragna using pip, a package installer for Python. You can do this by running the command `pip install ragna` in your terminal.\n", + "\n", + "2. Set up your configuration: Ragna uses a configuration file to set up the environment. This includes setting up the local root directory for storing files, the authentication class for user authentication, the key-value store class for temporary storage, and the document class for uploading and reading documents. You can also set up the source storages and assistants available for the user to use.\n", + "\n", + "3. Write your RAG components: You would need to write your own RAG components, which include the document, source storage, and assistant classes. These classes should inherit from the respective base classes in Ragna and implement the required methods.\n", + "\n", + "4. 
Run your application: Once you have set up your configuration and written your RAG components, you can run your application. If you are using the REST API, you would need to set up the hostname, port, and root path for the API.\n", + "\n", + "Please note that this is a high-level overview of the process. The exact steps\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "3e1564c3", + "metadata": {}, + "source": [ + "# Query Expansion with ReRanking\n", + "\n", + "The next preprocessor works the same as the above except that it also ranks similarity of the question to the sources that were found off of a retrieval from the naive answer. This could be useful when you want to expand the query but not too much." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "9bcb3adb", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "from ragna.core import (\n", + "QueryPreprocessor, MetadataFilter, ProcessedQuery\n", + ")\n", + "from sentence_transformers import CrossEncoder\n", + "\n", + "base_prompt = (\n", + "\"\"\"You are a helpful software engineer. Hypothetically given a specific software package in python,\n", + " how would you answer the following question from a more junior developer?\n", + "\n", + " question:\n", + "\"\"\")\n", + "\n", + "class QueryExpansionReRankingPreprocessor(QueryPreprocessor):\n", + "\n", + " def __init__(self, storage):\n", + " self.storage=storage\n", + " # self.assistant=assistant\n", + " self.messages = []\n", + " self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n", + "\n", + " def ask_assistant(self, prompt):\n", + " instruction = (base_prompt + prompt)\n", + " assistant_answer = stream_openai(client, \"user\", instruction)\n", + " return assistant_answer\n", + " \n", + " def rank_sources(self, sources, hypothetical_answer):\n", + " return sources\n", + "\n", + " def cross_encoder_rerank(self, query, sources):\n", + " #cross encoder reranker\n", + "\n", + " # Extract text content from Document objects and convert to strings\n", + " document_texts = [doc.content for doc in sources]\n", + " # Create pairs as strings\n", + " pairs = [[query, doc_text] for doc_text in document_texts]\n", + " # Predict scores for pairs\n", + " scores = self.cross_encoder.predict(pairs)\n", + " docs_and_scores = list(zip(scores, document_texts))\n", + " sorted_ds = sorted(docs_and_scores, key=lambda x: x[0], reverse=True)\n", + " top_docs = [x[1] for x in sorted_ds[:2]]\n", + " return top_docs\n", + "\n", + " def process(self, query: str, metadata_filter: Optional[MetadataFilter]):\n", + " hypothetical_answer = self.ask_assistant(query)\n", + " sources = self.storage.retrieve(\"ragna_docs\", metadata_filter, hypothetical_answer)\n", + " top_docs = self.cross_encoder_rerank(query, sources)\n", + "\n", + " \n", + " processed_query = \"\\n\".join(s for s in top_docs) + \"\\n\\n\" + query\n", + " return ProcessedQuery(\n", + " original_query=query,\n", + " processed_query=processed_query,\n", + " metadata_filter=None,\n", + " processor_name=self.display_name()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "2b3a28b0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrew/micromamba/envs/ragna-dev/lib/python3.10/site-packages/torch/cuda/__init__.py:129: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. 
changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1737846236919/work/c10/cuda/CUDAFunctions.cpp:108.)\n", + " return torch._C._cuda_getDeviceCount() > 0\n" + ] + }, + { + "data": { + "text/plain": [ + "ProcessedQuery(original_query='how do I build a rag application using ragna?', processed_query='# Frequently asked questions\\n\\n## Why should I use Ragna and not X?\\n\\n!!! tip \"TL;DR\"\\n\\n Ragna is the only tool out there that specializes in orchestrating RAG use cases\\n with arbitrary components, as well as offering a Python API, a REST API, and a web\\n UI for that.\\n\\n!!! note\\n\\n Although we try to be objective as possible, this section is inheritly biased. If\\n you are the author of a package we don\\'t mention below but think we should or your\\n package is mentiond but you feel we have mischaracterized it, please\\n [get in touch](https://github.com/Quansight/ragna/discussions).\\n\\nAfter the emergence of ChatGPT in November of 2022, the field of LLMs exploded. Today,\\nthere are many providers for LLM REST APIs out there. With these, we also have a\\nplethora of Python packages to build applications around the provided APIs. We cannot\\nsummarize the whole field here, so we stick to large projects in the Python ecosystem\\nfor comparison.\\n\\n| library or application | RAG | arbitrary components | Python API | REST API | web UI |\\n| ----------------------------------------------------- | :----------------: | :------------------: | :----------------: | :----------------: | :----------------: |\\n| Ragna | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |\\n| [LangChain](https://www.langchain.com/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Langroid](https://langroid.github.io/langroid/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [LlamaIndex](https://www.llamaindex.ai/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Ollama](https://ollama.com/) | :x: | :x: | :heavy_check_mark: | :heavy_check_mark: |\\n# Frequently asked questions\\n\\n## Why should I use Ragna and not X?\\n\\n!!! tip \"TL;DR\"\\n\\n Ragna is the only tool out there that specializes in orchestrating RAG use cases\\n with arbitrary components, as well as offering a Python API, a REST API, and a web\\n UI for that.\\n\\n!!! note\\n\\n Although we try to be objective as possible, this section is inheritly biased. If\\n you are the author of a package we don\\'t mention below but think we should or your\\n package is mentiond but you feel we have mischaracterized it, please\\n [get in touch](https://github.com/Quansight/ragna/discussions).\\n\\nAfter the emergence of ChatGPT in November of 2022, the field of LLMs exploded. Today,\\nthere are many providers for LLM REST APIs out there. With these, we also have a\\nplethora of Python packages to build applications around the provided APIs. 
We cannot\\nsummarize the whole field here, so we stick to large projects in the Python ecosystem\\nfor comparison.\\n\\n| library or application | RAG | arbitrary components | Python API | REST API | web UI |\\n| ----------------------------------------------------- | :----------------: | :------------------: | :----------------: | :----------------: | :----------------: |\\n| Ragna | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |\\n| [LangChain](https://www.langchain.com/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Langroid](https://langroid.github.io/langroid/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [LlamaIndex](https://www.llamaindex.ai/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Ollama](https://ollama.com/) | :x: | :x: | :heavy_check_mark: | :heavy_check_mark: |\\n\\nhow do I build a rag application using ragna?', metadata_filter=None, processing_history=[])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessor = QueryExpansionReRankingPreprocessor(storage)\n", + "\n", + "preprocessed_query = preprocessor.process(example_question, None)\n", + "preprocessed_query" + ] + }, + { + "cell_type": "markdown", + "id": "a9599280", + "metadata": {}, + "source": [ + "## Findings:\n", + "\n", + "I think I need a more extensive dataset and appropriate question. This seems like it could be useful in the case where you need to expand the query, but you have an extensive dataset so don't want to expand it by too much. I am also concerned that since you are scoring their similarity to the question and not the naive answer that you may not actually be scoring on the important expanded similarity.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "7ff74727", + "metadata": {}, + "source": [ + "# More experiments\n", + "\n", + "Below is an attempt to have an llm come up the with the steps necessary to find relevant documents. " + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "b43de1d7", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "from ragna.core import (\n", + "QueryPreprocessor, MetadataFilter, ProcessedQuery\n", + ")\n", + "from sentence_transformers import CrossEncoder\n", + "\n", + "base_prompt = (\n", + "\"\"\"Given the question below. 
Make a plan on how best to answer the question.\n", + " \n", + "retrieve relevant sources from a vector database to best answer the question and only return the plan.\n", + "\n", + " question:\n", + "\"\"\")\n", + "\n", + "class QueryExpansionReRankingPreprocessor(QueryPreprocessor):\n", + "\n", + " def __init__(self, storage):\n", + " self.storage=storage\n", + " # self.assistant=assistant\n", + " self.messages = []\n", + " self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n", + "\n", + " def ask_assistant(self, prompt):\n", + " instruction = (base_prompt + prompt)\n", + " assistant_answer = stream_openai(client, \"user\", instruction)\n", + " return assistant_answer\n", + "\n", + " def process(self, query: str, metadata_filter: Optional[MetadataFilter]):\n", + " hypothetical_answer = self.ask_assistant(query)\n", + " print(hypothetical_answer)\n", + " sources = self.storage.retrieve(\"ragna_docs\", metadata_filter, hypothetical_answer)\n", + "\n", + "\n", + " \n", + " processed_query = \"\\n\".join(s.content for s in sources) + \"\\n\\n\" + query\n", + " return ProcessedQuery(\n", + " original_query=query,\n", + " processed_query=processed_query,\n", + " metadata_filter=None,\n", + " processor_name=self.display_name()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "6c8cac77", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### Plan to Retrieve Relevant Sources from a Vector Database for Building a RAG Application\n", + "\n", + "1. **Define RAG Application Criteria:**\n", + " - Identify key topics and subtopics related to “RAG application” (e.g., architecture, frameworks, data sources, models).\n", + " - Establish keywords and phrases (e.g., \"retrieval-augmented generation\", \"building RAG applications\", \"RAG model implementation\").\n", + "\n", + "2. **Prepare Vector Database Query:**\n", + " - Formulate vector embeddings for identified keywords using a pre-trained language model.\n", + " - Create a combination of queries that encompass both general concepts and specific technical aspects.\n", + "\n", + "3. **Vector Search Execution:**\n", + " - Utilize the vector search capabilities of the database to retrieve documents that closely match the embeddings created.\n", + " - Set parameters for the search (e.g., similarity thresholds, number of results).\n", + "\n", + "4. **Filter Retrieved Sources:**\n", + " - Review the returned results for relevance based on predefined criteria (e.g., publication date, credibility, topic alignment).\n", + " - Remove duplicates and non-informative sources that do not contribute to building a RAG application.\n", + "\n", + "5. **Categorize Collected Sources:**\n", + " - Classify the relevant sources into categories such as guides, tutorials, research papers, and example projects.\n", + " - Note the type of content each source offers to aid in addressing different aspects of building RAG applications.\n", + "\n", + "6. **Summarize Key Insights:**\n", + " - For each relevant source, extract and summarize key insights, methodologies, and best practices.\n", + " - Highlight any specific examples or case studies shown in the sources to provide practical application perspectives.\n", + "\n", + "7. 
**Organize and Present Findings:**\n", + " - Compile the summarized insights and categorize them for easy access and understanding.\n", + " - Prepare to present the findings clearly, either as a report, a slide deck, or an annotated bibliography.\n", + "\n", + "8. **Review and Update Process:**\n", + " - Implement a process for regularly reviewing the vector database for new entries and frequently updating the findings based on the latest developments in RAG technologies.\n", + " - Keep track of versions and modifications in any methodology or implementation knowledge related to RAG applications. \n", + "\n", + "This plan aims to ensure that the information retrieved is comprehensive, relevant, and up-to-date for effectively building a RAG application.\n" + ] + }, + { + "data": { + "text/plain": [ + "'# Frequently asked questions\\n\\n## Why should I use Ragna and not X?\\n\\n!!! tip \"TL;DR\"\\n\\n Ragna is the only tool out there that specializes in orchestrating RAG use cases\\n with arbitrary components, as well as offering a Python API, a REST API, and a web\\n UI for that.\\n\\n!!! note\\n\\n Although we try to be objective as possible, this section is inheritly biased. If\\n you are the author of a package we don\\'t mention below but think we should or your\\n package is mentiond but you feel we have mischaracterized it, please\\n [get in touch](https://github.com/Quansight/ragna/discussions).\\n\\nAfter the emergence of ChatGPT in November of 2022, the field of LLMs exploded. Today,\\nthere are many providers for LLM REST APIs out there. With these, we also have a\\nplethora of Python packages to build applications around the provided APIs. We cannot\\nsummarize the whole field here, so we stick to large projects in the Python ecosystem\\nfor comparison.\\n\\n| library or application | RAG | arbitrary components | Python API | REST API | web UI |\\n| ----------------------------------------------------- | :----------------: | :------------------: | :----------------: | :----------------: | :----------------: |\\n| Ragna | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |\\n| [LangChain](https://www.langchain.com/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Langroid](https://langroid.github.io/langroid/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [LlamaIndex](https://www.llamaindex.ai/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Ollama](https://ollama.com/) | :x: | :x: | :heavy_check_mark: | :heavy_check_mark: |\\n# Frequently asked questions\\n\\n## Why should I use Ragna and not X?\\n\\n!!! tip \"TL;DR\"\\n\\n Ragna is the only tool out there that specializes in orchestrating RAG use cases\\n with arbitrary components, as well as offering a Python API, a REST API, and a web\\n UI for that.\\n\\n!!! note\\n\\n Although we try to be objective as possible, this section is inheritly biased. If\\n you are the author of a package we don\\'t mention below but think we should or your\\n package is mentiond but you feel we have mischaracterized it, please\\n [get in touch](https://github.com/Quansight/ragna/discussions).\\n\\nAfter the emergence of ChatGPT in November of 2022, the field of LLMs exploded. Today,\\nthere are many providers for LLM REST APIs out there. With these, we also have a\\nplethora of Python packages to build applications around the provided APIs. 
We cannot\\nsummarize the whole field here, so we stick to large projects in the Python ecosystem\\nfor comparison.\\n\\n| library or application | RAG | arbitrary components | Python API | REST API | web UI |\\n| ----------------------------------------------------- | :----------------: | :------------------: | :----------------: | :----------------: | :----------------: |\\n| Ragna | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |\\n| [LangChain](https://www.langchain.com/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Langroid](https://langroid.github.io/langroid/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [LlamaIndex](https://www.llamaindex.ai/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | :x: |\\n| [Ollama](https://ollama.com/) | :x: | :x: | :heavy_check_mark: | :heavy_check_mark: |\\n\\nhow do I build a rag application?'" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessor = QueryExpansionReRankingPreprocessor(storage)\n", + "\n", + "preprocessed_query = preprocessor.process(example_question, None)\n", + "preprocessed_query.processed_query" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "216e832c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'To break down the steps provided in your text for use in a Python program, first, we should identify the key components that will help shape the structure of the application. Since the steps deal with FAQs about Ragna and its comparison with similar libraries/applications, we can organize them into functions or data structures to facilitate user queries or interactions.\\n\\nHere\\'s how you could structure your Python program:\\n\\n1. **Create a FAQ Class**: This class will contain the FAQs and methods to retrieve them.\\n\\n2. **Implement Comparison Table**: This can be either a method in the same class or a separate data structure that compares Ragna with other libraries.\\n\\n3. **Define Methods for User Interaction**: Allow users to query information about Ragna and the comparison.\\n\\nHere’s an example implementation:\\n\\n```python\\nclass RagnaFAQ:\\n def __init__(self):\\n self.faqs = {\\n \"Why should I use Ragna and not X?\": {\\n \"TL;DR\": \"Ragna is the only tool out there that specializes in orchestrating RAG use cases \"\\n \"with arbitrary components, as well as offering a Python API, a REST API, and a web UI.\",\\n \"Note\": \"Although we try to be objective as possible, this section is inherently biased. If \"\\n \"you are the author of a package we don\\'t mention below but think we should \"\\n \"or your package is mentioned but you feel we have mischaracterized it, please \"\\n \"get in touch: https://github.com/Quansight/ragna/discussions.\",\\n \"Background\": \"The field of LLMs exploded after the emergence of ChatGPT in November of 2022. 
\"\\n \"Today, there are many providers for LLM REST APIs out there, along with numerous \"\\n \"Python packages.\"\\n },\\n }\\n\\n self.comparison_table = [\\n {\\n \"library\": \"Ragna\",\\n \"RAG\": True,\\n \"arbitrary_components\": True,\\n \"Python_API\": True,\\n \"REST_API\": True,\\n \"web_UI\": True\\n },\\n {\\n \"library\": \"LangChain\",\\n \"RAG\": True,\\n \"arbitrary_components\": True,\\n \"Python_API\": True,\\n \"REST_API\": False,\\n \"web_UI\": False\\n },\\n {\\n \"library\": \"Langroid\",\\n \"RAG\": True,\\n \"arbitrary_components\": True,\\n \"Python_API\": True,\\n \"REST_API\": False,\\n \"web_UI\": False\\n },\\n {\\n \"library\": \"LlamaIndex\",\\n \"RAG\": True,\\n \"arbitrary_components\": True,\\n \"Python_API\": True,\\n \"REST_API\": False,\\n \"web_UI\": False\\n },\\n {\\n \"library\": \"Ollama\",\\n \"RAG\": False,\\n \"arbitrary_components\": False,\\n \"Python_API\": True,\\n \"REST_API\": True,\\n \"web_UI\": False\\n }\\n ]\\n\\n def get_faq(self, question):\\n return self.faqs.get(question, \"Question not found.\")\\n\\n def print_comparison(self):\\n print(f\\'{\"Library\":<15} {\"RAG\":<5} {\"Arbitrary Components\":<20} {\"Python API\":<12} {\"REST API\":<10} {\"Web UI\":<10}\\')\\n for item in self.comparison_table:\\n print(f\"{item[\\'library\\']:<15} {str(item[\\'RAG\\']):<5} {str(item[\\'arbitrary_components\\']):<20} \"\\n f\"{str(item[\\'Python_API\\']):<12} {str(item[\\'REST_API\\']):<10} {str(item[\\'web_UI\\']):<10}\")\\n\\n\\n# Usage\\nragna_faq = RagnaFAQ()\\n\\n# Print TL;DR answer\\nprint(ragna_faq.get_faq(\"Why should I use Ragna and not X?\")[\"TL;DR\"])\\n\\n# Print the comparison table\\nragna_faq.print_comparison()\\n```\\n\\n### Explanation:\\n\\n- **Class `RagnaFAQ`**: Encapsulates the frequently asked questions and the comparison data.\\n- **`__init__` Method**: Initializes a dictionary (`faqs`) to store FAQs and a list of dictionaries to store comparison data.\\n- **`get_faq` Method**: Retrieves the answer to a specific FAQ based on the question asked.\\n- **`print_comparison` Method**: Prints a formatted comparison table for Ragna and other libraries.\\n\\n### Running the Program:\\n\\nWhen you run this program, it prints the TL;DR from the FAQ about why to use Ragna and displays a formatted comparison table of Ragna versus other libraries. You can adapt this structure to add more FAQs or modify the comparison criteria as needed.'" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "base_prompt = (\"\"\"how would I break up the following steps to use in a python program:\n", + "\n", + "steps:\n", + "\"\"\")\n", + "prompt = preprocessed_query.processed_query\n", + "instruction = (base_prompt + prompt)\n", + "assistant_answer = stream_openai(client, \"user\", instruction)\n", + "assistant_answer" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "deb1cb96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To break down the steps provided in your text for use in a Python program, first, we should identify the key components that will help shape the structure of the application. Since the steps deal with FAQs about Ragna and its comparison with similar libraries/applications, we can organize them into functions or data structures to facilitate user queries or interactions.\n", + "\n", + "Here's how you could structure your Python program:\n", + "\n", + "1. 
**Create a FAQ Class**: This class will contain the FAQs and methods to retrieve them.\n", + "\n", + "2. **Implement Comparison Table**: This can be either a method in the same class or a separate data structure that compares Ragna with other libraries.\n", + "\n", + "3. **Define Methods for User Interaction**: Allow users to query information about Ragna and the comparison.\n", + "\n", + "Here’s an example implementation:\n", + "\n", + "```python\n", + "class RagnaFAQ:\n", + " def __init__(self):\n", + " self.faqs = {\n", + " \"Why should I use Ragna and not X?\": {\n", + " \"TL;DR\": \"Ragna is the only tool out there that specializes in orchestrating RAG use cases \"\n", + " \"with arbitrary components, as well as offering a Python API, a REST API, and a web UI.\",\n", + " \"Note\": \"Although we try to be objective as possible, this section is inherently biased. If \"\n", + " \"you are the author of a package we don't mention below but think we should \"\n", + " \"or your package is mentioned but you feel we have mischaracterized it, please \"\n", + " \"get in touch: https://github.com/Quansight/ragna/discussions.\",\n", + " \"Background\": \"The field of LLMs exploded after the emergence of ChatGPT in November of 2022. \"\n", + " \"Today, there are many providers for LLM REST APIs out there, along with numerous \"\n", + " \"Python packages.\"\n", + " },\n", + " }\n", + "\n", + " self.comparison_table = [\n", + " {\n", + " \"library\": \"Ragna\",\n", + " \"RAG\": True,\n", + " \"arbitrary_components\": True,\n", + " \"Python_API\": True,\n", + " \"REST_API\": True,\n", + " \"web_UI\": True\n", + " },\n", + " {\n", + " \"library\": \"LangChain\",\n", + " \"RAG\": True,\n", + " \"arbitrary_components\": True,\n", + " \"Python_API\": True,\n", + " \"REST_API\": False,\n", + " \"web_UI\": False\n", + " },\n", + " {\n", + " \"library\": \"Langroid\",\n", + " \"RAG\": True,\n", + " \"arbitrary_components\": True,\n", + " \"Python_API\": True,\n", + " \"REST_API\": False,\n", + " \"web_UI\": False\n", + " },\n", + " {\n", + " \"library\": \"LlamaIndex\",\n", + " \"RAG\": True,\n", + " \"arbitrary_components\": True,\n", + " \"Python_API\": True,\n", + " \"REST_API\": False,\n", + " \"web_UI\": False\n", + " },\n", + " {\n", + " \"library\": \"Ollama\",\n", + " \"RAG\": False,\n", + " \"arbitrary_components\": False,\n", + " \"Python_API\": True,\n", + " \"REST_API\": True,\n", + " \"web_UI\": False\n", + " }\n", + " ]\n", + "\n", + " def get_faq(self, question):\n", + " return self.faqs.get(question, \"Question not found.\")\n", + "\n", + " def print_comparison(self):\n", + " print(f'{\"Library\":<15} {\"RAG\":<5} {\"Arbitrary Components\":<20} {\"Python API\":<12} {\"REST API\":<10} {\"Web UI\":<10}')\n", + " for item in self.comparison_table:\n", + " print(f\"{item['library']:<15} {str(item['RAG']):<5} {str(item['arbitrary_components']):<20} \"\n", + " f\"{str(item['Python_API']):<12} {str(item['REST_API']):<10} {str(item['web_UI']):<10}\")\n", + "\n", + "\n", + "# Usage\n", + "ragna_faq = RagnaFAQ()\n", + "\n", + "# Print TL;DR answer\n", + "print(ragna_faq.get_faq(\"Why should I use Ragna and not X?\")[\"TL;DR\"])\n", + "\n", + "# Print the comparison table\n", + "ragna_faq.print_comparison()\n", + "```\n", + "\n", + "### Explanation:\n", + "\n", + "- **Class `RagnaFAQ`**: Encapsulates the frequently asked questions and the comparison data.\n", + "- **`__init__` Method**: Initializes a dictionary (`faqs`) to store FAQs and a list of dictionaries to store comparison data.\n", + "- 
**`get_faq` Method**: Retrieves the answer to a specific FAQ based on the question asked.\n", + "- **`print_comparison` Method**: Prints a formatted comparison table for Ragna and other libraries.\n", + "\n", + "### Running the Program:\n", + "\n", + "When you run this program, it prints the TL;DR from the FAQ about why to use Ragna and displays a formatted comparison table of Ragna versus other libraries. You can adapt this structure to add more FAQs or modify the comparison criteria as needed.\n" + ] + } + ], + "source": [] + }, + { + "cell_type": "markdown", + "id": "43989a76", + "metadata": {}, + "source": [ + "## Create metadata filter\n", + "\n", + "Below is an attempt to create metadata filters themselves with an llm." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "ade43de4", + "metadata": {}, + "outputs": [], + "source": [ + "base_prompt = (\n", + "\"\"\"\n", + "question: What happened last year?\n", + "\n", + "This question will be used to retrieve relevant sources from a vector database.\n", + "The vector database contains sources with attached metadata. Below is an example \n", + "filters that can be used to more effectively retrieve data from the vector database.\n", + "For the above question. Return a list of relevant metadata filters.\n", + "\n", + "[\n", + " MetadataFilter.raw(\"raw\"),\n", + " MetadataFilter.and_(\n", + " [\n", + " MetadataFilter.raw(\"raw\"),\n", + " MetadataFilter.eq(\"key\", \"value\"),\n", + " ]\n", + " ),\n", + " \n", + " MetadataFilter.or_(\n", + " [\n", + " MetadataFilter.raw(\"raw\"),\n", + " MetadataFilter.eq(\"key\", \"value\"),\n", + " ]\n", + " ),\n", + " MetadataFilter.and_(\n", + " [\n", + " MetadataFilter.raw(\"raw\"),\n", + " MetadataFilter.or_(\n", + " [\n", + " MetadataFilter.eq(\"key\", \"value\"),\n", + " MetadataFilter.ne(\"other_key\", \"other_value\"),\n", + " ]\n", + " ),\n", + " ]\n", + " ),\n", + " MetadataFilter.or_(\n", + " [\n", + " MetadataFilter.raw(\"raw\"),\n", + " MetadataFilter.and_(\n", + " [\n", + " MetadataFilter.eq(\"key\", \"value\"),\n", + " MetadataFilter.ne(\"other_key\", \"other_value\"),\n", + " ]\n", + " ),\n", + " ]\n", + " ),\n", + " MetadataFilter.eq(\"key\", \"value\"),\n", + " MetadataFilter.ne(\"key\", \"value\"),\n", + " MetadataFilter.lt(\"key\", 1),\n", + " MetadataFilter.le(\"key\", 0),\n", + " MetadataFilter.gt(\"key\", 1),\n", + " MetadataFilter.ge(\"key\", 0),\n", + " MetadataFilter.in_(\"key\", [\"foo\", \"bar\"]),\n", + " MetadataFilter.not_in(\"key\", [\"foo\", \"bar\"]),\n", + "] \n", + "\n", + "The possible metadata for the relevant sources are as follows:\n", + "\"\"\"\n", + ")\n", + "\n", + "class MetadataFilterPreprocessor(QueryPreprocessor):\n", + "\n", + " def __init__(self, storage, corpus_name):\n", + " self.storage=storage\n", + " self.corpus_name = \"ragna_docs\"\n", + " # self.assistant=assistant\n", + " self.messages = []\n", + "\n", + " def ask_assistant(self, prompt):\n", + "\n", + " instruction = (base_prompt + prompt)\n", + " assistant_answer = stream_openai(client, \"user\", instruction)\n", + " return assistant_answer\n", + "\n", + " def process(self, query: str, metadata_filter: Optional[MetadataFilter]):\n", + " prompt = self.storage.list_metadata(self.corpus_name)\n", + " metadata = str(prompt[self.corpus_name])\n", + " metadata_filters = self.ask_assistant(metadata)\n", + " print(metadata_filters)\n", + " #sources = self.storage.retrieve(\"ragna_docs\", metadata_filter, hypothetical_answer)\n", + "\n", + " sources = \"\"\n", + " processed_query = 
\"\\n\".join(s.content for s in sources) + \"\\n\\n\" + query\n", + " return ProcessedQuery(\n", + " original_query=query,\n", + " processed_query=processed_query,\n", + " metadata_filter=None,\n", + " processor_name=self.display_name()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "e9e2987d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To effectively retrieve relevant sources from the vector database for the question about what happened last year, we can apply the following metadata filters:\n", + "\n", + "```python\n", + "[\n", + " MetadataFilter.eq(\"year\", 2022), # Assuming we're looking for events from the previous year\n", + " MetadataFilter.eq(\"document_name\", \"release-notes.md\"), # Assuming release notes may contain relevant yearly summaries\n", + " MetadataFilter.eq(\"key\", \"year\"), # Filter that could relate to a specific key about events\n", + " MetadataFilter.or_(\n", + " [\n", + " MetadataFilter.eq(\"key\", \"event\"), # This could refer to specific events reported in 2022\n", + " MetadataFilter.eq(\"key\", \"news\"), # This could refer to news summaries from 2022\n", + " ]\n", + " ),\n", + " MetadataFilter.and_(\n", + " [\n", + " MetadataFilter.gt(\"size\", 0), # Ensures that we are not retrieving empty documents\n", + " MetadataFilter.eq(\"extension\", \".md\"), # Filter to only include markdown documents\n", + " ]\n", + " ),\n", + " MetadataFilter.in_(\"document_id\", [\"0424ae53-ae02-46c8-9b0f-5c416fac0472\", \"9c4bfdb7-8ee2-42ee-ab8c-b068bdc2cc61\"]), # If there's known relevant document IDs\n", + "]\n", + "```\n", + "\n", + "These filters can help narrow down the relevant sources to those that specifically address events or significant occurrences from the previous year, while also ensuring that the documents fit within expected formats and sizes.\n" + ] + }, + { + "data": { + "text/plain": [ + "ProcessedQuery(original_query='What happened last year?', processed_query='\\n\\nWhat happened last year?', metadata_filter=None, processing_history=[])" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MetadataFilterPreprocessor(storage, \"ragna_docs\").process(\"What happened last year?\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "f2ff8b15", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/install.md', 'extension': '.md', 'size': 1304}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/index.md', 'extension': '.md', 'size': 1866}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/community/welcome.md', 'extension': '.md', 'size': 1469}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/community/contribute.md', 'extension': '.md', 'size': 3623}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/explanations/what-is-rag.md', 'extension': '.md', 'size': 136}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/examples/README.md', 'extension': '.md', 'size': 12}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/python-api.md', 'extension': '.md', 'size': 128}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/deploy.md', 'extension': '.md', 
'size': 970}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/config.md', 'extension': '.md', 'size': 3466}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/release-notes.md', 'extension': '.md', 'size': 25239}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/faq.md', 'extension': '.md', 'size': 4954}\n", + "{'path': '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/tutorials/README.md', 'extension': '.md', 'size': 13}\n" + ] + } + ], + "source": [ + "for document in documents:\n", + " print(document.metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "ffb3e4c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"{'document_id': ('str', ['0424ae53-ae02-46c8-9b0f-5c416fac0472', '09d71a8a-c807-488b-bbb8-be731b880086', '0f72b676-9b07-4d22-b983-615b6fe084ba', '1d159e07-a0ec-4c6f-8bf4-0917725afc3e', '1e4a4428-e7e9-405f-9b6f-c40d2bde31c2', '24cc63bc-69b1-4b3b-bf2c-bf11e867a689', '25e7b0f5-9a2f-405f-9df3-c8f4e6d68ff0', '264171f2-2785-4a52-b223-19e525813290', '26eb3473-2db4-42cd-bc7f-64fe968f1b6b', '2a4ef41f-c049-4f98-9d0e-4c83a488edb4', '3e38190c-85de-4871-94e3-45a5c6f890e7', '4046f5fe-4f0c-4812-a8a3-ebf2598d2803', '43db5fee-cb9e-4f4f-8fca-b08b00127295', '5d26e37e-2434-490f-889c-b80492619836', '6d8efa94-5ad8-476d-bdf8-39f46e3b6117', '71f432ec-e2a0-47d5-b0a7-f873d5083e7c', '72fd4d34-d42f-481b-9b76-0a12b14f5cab', '73bc7d98-3c2d-466b-b8c3-ccff46a98892', '7c87c95a-aba4-4199-9648-69e2459c8d02', '7d50b63c-d2ef-4990-8ec6-3de124904853', '82b1ad33-3d9f-4f61-b1af-ab56e8b94bf5', '8a7278c7-7f1b-48c8-9948-36a3b6aae524', '8b01fc99-c6cb-469f-8371-f3738d51e4fa', '9c4bfdb7-8ee2-42ee-ab8c-b068bdc2cc61', 'a4c3753a-8bd3-458f-83e2-6a9667800bf0', 'a74cd1dd-ca0e-4a36-ad0b-1613436e149c', 'acf11616-dbb1-429f-9fc4-0b1b41f0db67', 'd289cf68-e428-4d4b-9497-2c351fbaf087', 'd2dfbca9-f86a-4736-9029-70d14278ec8c', 'd43c5679-6e13-48cc-bab6-4d4e3e5eed13', 'd4463915-e417-4686-862e-abf4d7e2003d', 'deebffb0-8707-4cb6-84a9-45b10f0bd19c', 'edb58664-7caa-4e82-b7bd-229c1f9f59a4', 'ef0177eb-b217-4295-b383-1046a984ceda', 'fb1abbf7-22b8-4b6b-81a5-e451f0cf4f62', 'fd7af4fd-27b7-4a6c-94a8-254c1a16fc9b']), 'document_name': ('str', ['README.md', 'config.md', 'contribute.md', 'deploy.md', 'faq.md', 'index.md', 'install.md', 'python-api.md', 'release-notes.md', 'welcome.md', 'what-is-rag.md']), 'path': ('str', ['/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/community/contribute.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/community/welcome.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/examples/README.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/explanations/what-is-rag.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/index.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/install.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/config.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/deploy.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/faq.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/python-api.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/references/release-notes.md', '/home/andrew/.dropbox-hm/Dropbox/quansight/dev/ragna/ragna/docs/tutorials/README.md']), 'extension': 
('str', ['.md']), 'size': ('int', [12, 13, 128, 136, 970, 1304, 1469, 1866, 3466, 3623, 4954, 25239])}\"" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str(storage.list_metadata(\"ragna_docs\")[\"ragna_docs\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71d08865", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ragna-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/preprocessing_demo/temporal_docs/doc_0.md b/preprocessing_demo/temporal_docs/doc_0.md new file mode 100644 index 00000000..7c331aac --- /dev/null +++ b/preprocessing_demo/temporal_docs/doc_0.md @@ -0,0 +1,3 @@ +# Project report 2022 + +This year we decided to go to the moon diff --git a/preprocessing_demo/temporal_docs/doc_1.md b/preprocessing_demo/temporal_docs/doc_1.md new file mode 100644 index 00000000..052108c9 --- /dev/null +++ b/preprocessing_demo/temporal_docs/doc_1.md @@ -0,0 +1,3 @@ +# Project report 2023 + +This year we built a rocket diff --git a/preprocessing_demo/temporal_docs/doc_2.md b/preprocessing_demo/temporal_docs/doc_2.md new file mode 100644 index 00000000..d9a2ebad --- /dev/null +++ b/preprocessing_demo/temporal_docs/doc_2.md @@ -0,0 +1,3 @@ +# Project report 2024 + +This year we went to the moon diff --git a/preprocessing_demo/temporal_docs/mkdocs.py b/preprocessing_demo/temporal_docs/mkdocs.py new file mode 100644 index 00000000..4ede1269 --- /dev/null +++ b/preprocessing_demo/temporal_docs/mkdocs.py @@ -0,0 +1,19 @@ +from pathlib import Path + +docs = [ + """# Project report 2022 + +This year we decided to go to the moon""", + """# Project report 2023 + +This year we built a rocket""", + """# Project report 2024 + +This year we went to the moon""", +] + +base_path = Path(__file__).parent +for i, doc in enumerate(docs): + doc_path = base_path / f"doc_{i}.md" + with open(doc_path, "w") as f: + f.write(doc) diff --git a/ragna/core/__init__.py b/ragna/core/__init__.py index 7e297275..e2d3a117 100644 --- a/ragna/core/__init__.py +++ b/ragna/core/__init__.py @@ -21,6 +21,9 @@ "Source", "SourceStorage", "PlainTextDocumentHandler", + "QueryProcessingStep", + "ProcessedQuery", + "QueryPreprocessor", ] from ._utils import ( @@ -51,6 +54,9 @@ Component, Message, MessageRole, + ProcessedQuery, + QueryPreprocessor, + QueryProcessingStep, Source, SourceStorage, ) diff --git a/ragna/core/_components.py b/ragna/core/_components.py index fc0d5e44..4dfb6c42 100644 --- a/ragna/core/_components.py +++ b/ragna/core/_components.py @@ -5,12 +5,14 @@ import functools import inspect import uuid +from dataclasses import field from datetime import datetime, timezone from typing import ( Any, AsyncIterable, AsyncIterator, Iterator, + List, Optional, Type, Union, ) @@ -302,3 +304,32 @@ def answer(self, messages: list[Message]) -> Iterator[str]: Answer. """ ... + + +class QueryProcessingStep(pydantic.BaseModel): + original_query: str + processed_query: str + metadata_filter: Optional[MetadataFilter] = None + processor_name: str = "" + + +class ProcessedQuery(pydantic.BaseModel): + """original_query is the query as it was passed to the preprocessor. 
+ processed_query is the query after all processing steps have been applied. + metadata_filter is the filter to apply when retrieving sources.""" + + original_query: str + processed_query: str + metadata_filter: Optional[MetadataFilter] = None + processing_history: List[QueryProcessingStep] = field(default_factory=list) + + +class QueryPreprocessor(Component, abc.ABC): + """Abstract base class for query preprocessors.""" + + @abc.abstractmethod + def process( + self, query: str, metadata_filter: Optional[MetadataFilter] = None + ) -> ProcessedQuery: + """Preprocess a query.""" + ... diff --git a/ragna/core/_rag.py b/ragna/core/_rag.py index 121f39a0..9af0008d 100644 --- a/ragna/core/_rag.py +++ b/ragna/core/_rag.py @@ -28,7 +28,14 @@ from ragna._utils import as_async_iterator, as_awaitable, default_user -from ._components import Assistant, Component, Message, MessageRole, SourceStorage +from ._components import ( + Assistant, + Component, + Message, + MessageRole, + QueryPreprocessor, + SourceStorage, +) from ._document import Document, LocalDocument from ._metadata_filter import MetadataFilter from ._utils import RagnaException, merge_models @@ -96,7 +103,6 @@ def _load_component( ) -> Optional[C]: cls: type[C] instance: Optional[C] - if isinstance(component, Component): instance = cast(C, component) cls = type(instance) @@ -148,6 +154,7 @@ def chat( *, source_storage: Union[SourceStorage, type[SourceStorage], str], assistant: Union[Assistant, type[Assistant], str], + preprocessor: Optional[QueryPreprocessor] = None, corpus_name: str = "default", **params: Any, ) -> Chat: @@ -167,11 +174,14 @@ corpus_name: Corpus of documents to use. **params: Additional parameters passed to the source storage and assistant. """ + if preprocessor is not None: + preprocessor = cast(QueryPreprocessor, self._load_component(preprocessor)) # type: ignore[arg-type] return Chat( self, input=input, source_storage=cast(SourceStorage, self._load_component(source_storage)), # type: ignore[arg-type] assistant=cast(Assistant, self._load_component(assistant)), # type: ignore[arg-type] + preprocessor=preprocessor, corpus_name=corpus_name, **params, ) @@ -239,6 +249,7 @@ def __init__( *, source_storage: SourceStorage, assistant: Assistant, + preprocessor: Optional[QueryPreprocessor] = None, corpus_name: str = "default", **params: Any, ) -> None: @@ -248,6 +259,7 @@ self.source_storage = source_storage self.assistant = assistant self.corpus_name = corpus_name + self.preprocessor = preprocessor special_params = SpecialChatParams().model_dump() special_params.update(params) @@ -299,9 +311,16 @@ async def answer(self, prompt: str, *, stream: bool = False) -> Message: http_status_code=status.HTTP_400_BAD_REQUEST, http_detail=RagnaException.EVENT, ) + if self.preprocessor is not None: + processed = self.preprocessor.process(prompt, self.metadata_filter) + prompt = processed.processed_query + self.metadata_filter = processed.metadata_filter sources = await self._as_awaitable( + self.source_storage.retrieve, + self.corpus_name, + self.metadata_filter, + prompt, ) if not sources: event = "Unable to retrieve any sources." 
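The `ProcessedQuery.processing_history` field introduced above is meant to carry one `QueryProcessingStep` per preprocessing pass. As a rough sketch of how that could be used (the `ChainedPreprocessor` below is hypothetical and not part of this diff, and it assumes a `QueryPreprocessor` subclass may define its own `__init__`), a wrapper preprocessor could run several preprocessors in sequence and record each step:

```python
from typing import List, Optional

from ragna.core import (
    MetadataFilter,
    ProcessedQuery,
    QueryPreprocessor,
    QueryProcessingStep,
)


class ChainedPreprocessor(QueryPreprocessor):
    """Hypothetical wrapper: runs preprocessors in order and keeps a per-step history."""

    def __init__(self, *preprocessors: QueryPreprocessor) -> None:
        self._preprocessors = preprocessors

    def process(
        self, query: str, metadata_filter: Optional[MetadataFilter] = None
    ) -> ProcessedQuery:
        history: List[QueryProcessingStep] = []
        current_query, current_filter = query, metadata_filter
        for preprocessor in self._preprocessors:
            result = preprocessor.process(current_query, current_filter)
            # Record what this single pass did to the query and the filter.
            history.append(
                QueryProcessingStep(
                    original_query=current_query,
                    processed_query=result.processed_query,
                    metadata_filter=result.metadata_filter,
                    processor_name=preprocessor.display_name(),
                )
            )
            current_query = result.processed_query
            current_filter = result.metadata_filter
        return ProcessedQuery(
            original_query=query,
            processed_query=current_query,
            metadata_filter=current_filter,
            processing_history=history,
        )
```

Since `Chat.answer()` only calls a single `preprocessor.process()`, chaining like this keeps individual preprocessors single-purpose while still fitting the one-preprocessor hook added to `Rag.chat()`.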
diff --git a/ragna/preprocessors/__init__.py b/ragna/preprocessors/__init__.py new file mode 100644 index 00000000..84947b4a --- /dev/null +++ b/ragna/preprocessors/__init__.py @@ -0,0 +1,10 @@ +__all__ = [ + "RagnaDemoPreprocessor", +] + +from ragna._utils import fix_module + +from ._demo import RagnaDemoPreprocessor + +fix_module(globals()) +del fix_module diff --git a/ragna/preprocessors/_demo.py b/ragna/preprocessors/_demo.py new file mode 100644 index 00000000..f2059cd1 --- /dev/null +++ b/ragna/preprocessors/_demo.py @@ -0,0 +1,30 @@ +from typing import Optional + +from ragna.core import ( + MetadataFilter, + ProcessedQuery, + QueryPreprocessor, + QueryProcessingStep, +) + + +class RagnaDemoPreprocessor(QueryPreprocessor): + def process( + self, query: str, metadata_filter: Optional[MetadataFilter] = None + ) -> ProcessedQuery: + """Prepend a short demo note to the query and record a single processing step.""" + processed_query = """This is a demo preprocessor. It doesn't do anything to the query. original query: """ + processed_query += query + return ProcessedQuery( + original_query=query, + processed_query=processed_query, + metadata_filter=metadata_filter, + processing_history=[ + QueryProcessingStep( + original_query=query, + processed_query=processed_query, + metadata_filter=metadata_filter, + processor_name=self.display_name(), + ) + ], + ) diff --git a/tests/preprocessors/__init__.py b/tests/preprocessors/__init__.py new file mode 100644 index 00000000..e69de29b
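The diff adds an empty `tests/preprocessors/__init__.py` but no test cases yet. A minimal sketch of what a first test for `RagnaDemoPreprocessor` could look like follows; the file name and the assertions are assumptions rather than part of this diff, and it assumes the preprocessor can be constructed without arguments:

```python
# tests/preprocessors/test_demo.py (hypothetical file name)
from ragna.preprocessors import RagnaDemoPreprocessor


def test_demo_preprocessor_records_single_step():
    query = "What happened last year?"
    preprocessor = RagnaDemoPreprocessor()
    processed = preprocessor.process(query)

    # The original query is preserved and embedded in the processed query.
    assert processed.original_query == query
    assert query in processed.processed_query

    # No metadata filter was passed in, so none should come back.
    assert processed.metadata_filter is None

    # Exactly one processing step is recorded, attributed to the demo preprocessor.
    assert len(processed.processing_history) == 1
    assert processed.processing_history[0].processor_name == preprocessor.display_name()
```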