
Commit bbde96f

feat: simplify RAG API

lsorber committed Dec 3, 2024
1 parent 851311c commit bbde96f
Showing 12 changed files with 359 additions and 338 deletions.
75 changes: 61 additions & 14 deletions README.md
@@ -157,38 +157,85 @@ insert_document(Path("Special Relativity.pdf"), config=my_config)

### 3. Searching and Retrieval-Augmented Generation (RAG)

Now, you can search for chunks with vector search, keyword search, or a hybrid of the two. You can also rerank the search results with the configured reranker. And you can use any search method of your choice (`hybrid_search` is the default) together with reranking to answer questions with RAG:
#### 3.1 Simple RAG pipeline

Now you can run a simple but powerful RAG pipeline that consists of three steps: retrieving the most relevant chunk spans (each of which is a list of consecutive chunks) with hybrid search and reranking, converting the user prompt to a RAG instruction and appending it to the message history, and finally generating the RAG response:

```python
from raglite import create_rag_instruction, rag, retrieve_rag_context

# Retrieve relevant chunk spans with hybrid search and reranking:
user_prompt = "How is intelligence measured?"
chunk_spans = retrieve_rag_context(query=user_prompt, num_chunks=5, config=my_config)

# Append a RAG instruction based on the user prompt and context to the message history:
messages = [] # Or start with an existing message history.
messages.append(create_rag_instruction(user_prompt=user_prompt, context=chunk_spans))

# Stream the RAG response:
stream = rag(messages, config=my_config)
for update in stream:
    print(update, end="")

# Access the documents cited in the RAG response:
documents = [chunk_span.document for chunk_span in chunk_spans]
```
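
The `documents` list gives you everything needed to show citations alongside the response. As a minimal sketch (assuming, as in the snippet above, that each chunk span exposes a `document` attribute and that documents carry an `id`), you could render a simple source list like this:

```python
# Print a numbered source list for the RAG response (attribute names assumed):
for i, chunk_span in enumerate(chunk_spans):
    print(f"[{i + 1}] {chunk_span.document.id}")
```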

#### 3.2 Advanced RAG pipeline

> [!TIP]
> 🥇 Reranking can significantly improve the output quality of a RAG application. To add reranking to your application: first search for a larger set of 20 relevant chunks, then rerank them with a [rerankers](https://github.com/AnswerDotAI/rerankers) reranker, and finally keep the top 5 chunks.
In addition to the simple RAG pipeline, RAGLite also offers advanced control over each individual step. A full pipeline consists of the following steps:

1. Searching for relevant chunks with keyword, vector, or hybrid search
2. Retrieving the chunks from the database
3. Reranking the chunks and truncating the results to the top 5
4. Extending the chunks with their neighbors and grouping them into chunk spans
5. Converting the user prompt to a RAG instruction and appending it to the message history
6. Streaming an LLM response to the message history
7. Accessing the cited documents from the chunk spans

```python
# Search for chunks:
from raglite import hybrid_search, keyword_search, vector_search

prompt = "How is intelligence measured?"
chunk_ids_vector, _ = vector_search(prompt, num_results=20, config=my_config)
chunk_ids_keyword, _ = keyword_search(prompt, num_results=20, config=my_config)
chunk_ids_hybrid, _ = hybrid_search(prompt, num_results=20, config=my_config)
user_prompt = "How is intelligence measured?"
chunk_ids_vector, _ = vector_search(user_prompt, num_results=20, config=my_config)
chunk_ids_keyword, _ = keyword_search(user_prompt, num_results=20, config=my_config)
chunk_ids_hybrid, _ = hybrid_search(user_prompt, num_results=20, config=my_config)

# Retrieve chunks:
from raglite import retrieve_chunks

chunks_hybrid = retrieve_chunks(chunk_ids_hybrid, config=my_config)

# Rerank chunks:
# Rerank chunks and keep the top 5 (optional, but recommended):
from raglite import rerank_chunks

chunks_reranked = rerank_chunks(prompt, chunks_hybrid, config=my_config)
chunks_reranked = rerank_chunks(user_prompt, chunks_hybrid, config=my_config)
chunks_reranked = chunks_reranked[:5]

# Extend chunks with their neighbors and group them into chunk spans:
from raglite import retrieve_chunk_spans

chunk_spans = retrieve_chunk_spans(chunks_reranked, config=my_config)

# Append a RAG instruction based on the user prompt and context to the message history:
from raglite import create_rag_instruction

messages = [] # Or start with an existing message history.
messages.append(create_rag_instruction(user_prompt=user_prompt, context=chunk_spans))

# Answer questions with RAG:
# Stream the RAG response:
from raglite import rag

prompt = "What does it mean for two events to be simultaneous?"
stream = rag(prompt, config=my_config)
stream = rag(messages, config=my_config)
for update in stream:
    print(update, end="")

# You can also pass a search method or search results directly:
stream = rag(prompt, search=hybrid_search, config=my_config)
stream = rag(prompt, search=chunks_reranked, config=my_config)
# Access the documents cited in the RAG response:
documents = [chunk_span.document for chunk_span in chunk_spans]
```
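
If you find yourself repeating these steps, they compose naturally into a small helper. The sketch below is not part of RAGLite's API; it merely chains the calls shown above into one function (the name `answer_with_rag` is ours):

```python
from raglite import (
    RAGLiteConfig,
    create_rag_instruction,
    hybrid_search,
    rag,
    rerank_chunks,
    retrieve_chunk_spans,
    retrieve_chunks,
)


def answer_with_rag(user_prompt: str, config: RAGLiteConfig) -> str:
    """Sketch: search, rerank, group into chunk spans, and generate a RAG response."""
    # Search for relevant chunks and retrieve them from the database.
    chunk_ids, _ = hybrid_search(user_prompt, num_results=20, config=config)
    chunks = retrieve_chunks(chunk_ids, config=config)
    # Rerank the chunks and keep the top 5.
    chunks_reranked = rerank_chunks(user_prompt, chunks, config=config)[:5]
    # Extend the chunks with their neighbors and group them into chunk spans.
    chunk_spans = retrieve_chunk_spans(chunks_reranked, config=config)
    # Convert the user prompt to a RAG instruction and generate the response.
    messages = [create_rag_instruction(user_prompt=user_prompt, context=chunk_spans)]
    return "".join(rag(messages, config=config))
```

Because every step is explicit, you can swap `hybrid_search` for `vector_search` or `keyword_search`, or change how many chunks survive reranking, without touching the rest of the pipeline.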

### 4. Computing and using an optimal query adapter
@@ -200,7 +247,7 @@

RAGLite can compute and apply an [optimal closed-form query adapter](src/raglite
from raglite import insert_evals, update_query_adapter

insert_evals(num_evals=100, config=my_config)
update_query_adapter(config=my_config) # From here, simply call vector_search to use the query adapter.
update_query_adapter(config=my_config) # From here, every vector search will use the query adapter.
```
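
For illustration, using the adapter might look like the following; the calling code does not change, since the adapter is applied inside `vector_search` itself (this snippet reuses the search API from section 3.2):

```python
from raglite import vector_search

# After update_query_adapter, every vector search (and, by extension, every
# hybrid search that builds on it) benefits from the query adapter automatically:
chunk_ids, scores = vector_search("How is intelligence measured?", num_results=5, config=my_config)
```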

### 5. Evaluation of retrieval and generation
35 changes: 18 additions & 17 deletions src/raglite/__init__.py
@@ -5,38 +5,39 @@
from raglite._eval import answer_evals, evaluate, insert_evals
from raglite._insert import insert_document
from raglite._query_adapter import update_query_adapter
from raglite._rag import async_generate, generate, get_context_segments
from raglite._rag import async_rag, create_rag_instruction, rag, retrieve_rag_context
from raglite._search import (
    hybrid_search,
    keyword_search,
    rerank_chunks,
    retrieve_chunk_spans,
    retrieve_chunks,
    retrieve_segments,
    vector_search,
)

__all__ = [
    # Config
    "RAGLiteConfig",
    "answer_evals",
    "async_generate",
    # CLI
    "cli",
    "evaluate",
    # RAG
    "generate",
    "get_context_segments",
    # Search
    "hybrid_search",
    # Insert
    "insert_document",
    # Evaluate
    "insert_evals",
    # Search
    "hybrid_search",
    "keyword_search",
    "rerank_chunks",
    "vector_search",
    "retrieve_chunks",
    "retrieve_segments",
    "retrieve_chunk_spans",
    "rerank_chunks",
    # RAG
    "retrieve_rag_context",
    "create_rag_instruction",
    "async_rag",
    "rag",
    # Query adapter
    "update_query_adapter",
    "vector_search",
    # Evaluate
    "insert_evals",
    "answer_evals",
    "evaluate",
    # CLI
    "cli",
]
38 changes: 20 additions & 18 deletions src/raglite/_chainlit.py
@@ -8,18 +8,20 @@

from raglite import (
    RAGLiteConfig,
    async_generate,
    get_context_segments,
    async_rag,
    create_rag_instruction,
    hybrid_search,
    insert_document,
    rerank_chunks,
    retrieve_chunk_spans,
    retrieve_chunks,
)
from raglite._markdown import document_to_markdown

async_insert_document = cl.make_async(insert_document)
async_hybrid_search = cl.make_async(hybrid_search)
async_retrieve_chunks = cl.make_async(retrieve_chunks)
async_retrieve_chunk_spans = cl.make_async(retrieve_chunk_spans)
async_rerank_chunks = cl.make_async(rerank_chunks)


@@ -85,35 +87,35 @@ async def handle_message(user_message: cl.Message) -> None:
                    step.input = Path(file.path).name
                    await async_insert_document(Path(file.path), config=config)
    # Append any inline attachments to the user prompt.
    user_prompt = f"{user_message.content}\n\n" + "\n\n".join(
        f'<attachment index="{i}">\n{attachment.strip()}\n</attachment>'
        for i, attachment in enumerate(inline_attachments)
    user_prompt = (
        "\n\n".join(
            f'<attachment index="{i}">\n{attachment.strip()}\n</attachment>'
            for i, attachment in enumerate(inline_attachments)
        )
        + f"\n\n{user_message.content}"
    )
    # Search for relevant contexts for RAG.
    async with cl.Step(name="search", type="retrieval") as step:
        step.input = user_message.content
        chunk_ids, _ = await async_hybrid_search(query=user_prompt, num_results=10, config=config)
        chunks = await async_retrieve_chunks(chunk_ids=chunk_ids, config=config)
        step.output = chunks
        step.elements = [  # Show the top 3 chunks inline.
            cl.Text(content=str(chunk), display="inline") for chunk in chunks[:3]
        step.elements = [  # Show the top chunks inline.
            cl.Text(content=str(chunk), display="inline") for chunk in chunks[:5]
        ]
    # Rerank the chunks.
    # Rerank the chunks and group them into chunk spans.
    async with cl.Step(name="rerank", type="rerank") as step:
        step.input = chunks
        chunks = await async_rerank_chunks(query=user_prompt, chunk_ids=chunks, config=config)
        step.output = chunks
        step.elements = [  # Show the top 3 chunks inline.
            cl.Text(content=str(chunk), display="inline") for chunk in chunks[:3]
        chunk_spans = await async_retrieve_chunk_spans(chunks[:5], config=config)
        step.output = chunk_spans
        step.elements = [  # Show the top chunk spans inline.
            cl.Text(content=str(chunk_span), display="inline") for chunk_span in chunk_spans
        ]
    # Stream the LLM response.
    assistant_message = cl.Message(content="")
    context_segments = get_context_segments(user_prompt, config=config)
    async for token in async_generate(
        prompt=user_prompt,
        messages=cl.chat_context.to_openai()[-5:-1],  # type: ignore[no-untyped-call]
        context_segments=context_segments,
        config=config,
    ):
    messages: list[dict[str, str]] = cl.chat_context.to_openai()  # type: ignore[no-untyped-call]
    messages.append(create_rag_instruction(user_prompt=user_prompt, context=chunk_spans))
    async for token in async_rag(messages, config=config):
        await assistant_message.stream_token(token)
    await assistant_message.update()  # type: ignore[no-untyped-call]
