Skip to content

Commit

Permalink
Merge pull request #16 from deepset-ai/trec
Browse files Browse the repository at this point in the history
TREC: PoC
  • Loading branch information
davidsbatista authored Jul 15, 2024
2 parents 4473327 + b49a1ab commit 36097e3
Show file tree
Hide file tree
Showing 8 changed files with 365 additions and 43 deletions.
59 changes: 22 additions & 37 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# Local run files
qa.db
**/qa.db
**/*qa*.db
**/test-reports

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand All @@ -20,6 +26,7 @@ parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
Expand Down Expand Up @@ -49,7 +56,6 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
Expand All @@ -68,11 +74,10 @@ instance/
# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
# documentation
docs/pydoc/temp/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
Expand All @@ -83,9 +88,7 @@ profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
Expand All @@ -94,22 +97,7 @@ ipython_config.py
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
# pyflow
__pypackages__/

# Celery stuff
Expand Down Expand Up @@ -146,20 +134,17 @@ dmypy.json
# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea

# MacOS
# VSCode
.vscode

# macOS
.DS_Store
*/.DS_Store
**/.DS_Store

# http cache (requests-cache)
**/http_cache.sqlite

# ruff
.ruff_cache
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,16 @@ repos:
rev: v0.5.0
hooks:
- id: ruff
exclude: "datasets/*"
- id: ruff-format
exclude: "datasets/*"


- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
hooks:
- id: codespell
exclude: "datasets/*|evaluations/trec/trec_evaluation.py"
additional_dependencies:
- tomli

Expand Down
4 changes: 2 additions & 2 deletions evaluations/evaluation_aragog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pathlib import Path
from typing import List, Tuple

from architectures.basic_rag import basic_rag
from architectures.baseline_rag import built_basic_rag
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
Expand Down Expand Up @@ -57,7 +57,7 @@ def run_basic_rag(doc_store, sample_questions, embedding_model, top_k):
Runs the basic rag model on a set of sample questions and answers.
"""

rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=top_k)
rag = built_basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=top_k)

predicted_answers = []
retrieved_contexts = []
Expand Down
4 changes: 2 additions & 2 deletions evaluations/evaluation_aragog_harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import List, Tuple

from architectures.basic_rag import basic_rag
from architectures.baseline_rag import built_basic_rag
from architectures.hyde_rag import rag_with_hyde
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
Expand Down Expand Up @@ -90,7 +90,7 @@ def main():
doc_store = indexing(embeddings, chunk_size)

# baseline RAG
rag = basic_rag(document_store=doc_store, embedding_model=embeddings, top_k=top_k)
rag = built_basic_rag(document_store=doc_store, embedding_model=embeddings, top_k=top_k)
rag_components = {
RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata(
name="query_embedder", input_mapping={"query": "text"}
Expand Down
4 changes: 2 additions & 2 deletions evaluations/evaluation_sentence_window_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import List, Tuple

from architectures.basic_rag import basic_rag
from architectures.baseline_rag import built_basic_rag
from architectures.sentence_window_retrieval import rag_sentence_window_retrieval
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
Expand Down Expand Up @@ -115,7 +115,7 @@ def main():
eval_results_rag_window = EvaluationRunResult(run_name="window-retrieval", inputs=inputs, results=results)

# Baseline RAG
rag = basic_rag(doc_store, embedding_model, top_k)
rag = built_basic_rag(doc_store, embedding_model, top_k)
retrieved_contexts, predicted_answers = run_rag(rag, questions)
results, inputs = run_evaluation(questions, answers, retrieved_contexts, predicted_answers, embedding_model)
eval_results_base_rag = EvaluationRunResult(run_name="base-rag", inputs=inputs, results=results)
Expand Down
30 changes: 30 additions & 0 deletions evaluations/trec/indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from pipelines import indexing


def get_qdrant_doc_store(
    embedding_dim: int = 768,
    *,
    url: str = "localhost",
    index: str = "trec2024",
    recreate_index: bool = True,
):
    """
    Build the Qdrant document store used by the TREC experiments.

    All defaults reproduce the values that used to be hard-coded, so calling
    this function without arguments behaves exactly as before.

    :param embedding_dim: Dimensionality of the embeddings stored in the collection.
    :param url: Location of the Qdrant server.
    :param index: Name of the Qdrant collection.
    :param recreate_index: Forwarded to `QdrantDocumentStore`. Keep the default when
        (re)indexing from scratch; pass `False` when you only want to connect to a
        collection that has already been populated (e.g. from a retrieval script),
        otherwise the existing data is expected to be dropped - see the
        `QdrantDocumentStore` documentation.
    :returns: The configured `QdrantDocumentStore`.
    """
    return QdrantDocumentStore(
        url=url,
        index=index,
        embedding_dim=embedding_dim,
        on_disk=True,
        recreate_index=recreate_index,
        hnsw_config={"m": 16, "ef_construct": 64},  # Optional
    )


def main():
    """Index the sample TREC corpus file into the Qdrant document store."""
    # a Qdrant server has to be reachable on localhost before running this
    print("Connecting to Qdrant...")
    store = get_qdrant_doc_store()

    print("Indexing documents...")
    # hand-made sample of the corpus (around 150MB), used for testing purposes
    sample_files = {"../../datasets/TREC/corpus/msmarco_v2.1_doc_segmented_00_sample.json"}
    embedding_model = "sentence-transformers/msmarco-distilroberta-base-v2"
    chunk_size = 128
    indexing(store, embedding_model, chunk_size, sample_files)


if __name__ == "__main__":
    main()
84 changes: 84 additions & 0 deletions evaluations/trec/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import json
from typing import List, Set

from haystack import Document, Pipeline, component
from haystack.components.builders import AnswerBuilder, PromptBuilder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever


@component
class ParseTRECCorpus:
    """
    Haystack component that reads TREC corpus files (one JSON object per line)
    and converts every line into a `Document`.
    """

    @staticmethod
    def create_document(line: str):
        """
        Build a `Document` from a single JSON-encoded corpus line.

        The "segment" field becomes the document content; "docid" and "url"
        are stored in the document metadata.

        :param line: One line of the corpus file, holding a JSON object.
        :returns: The resulting `Document`.
        """
        doc = json.loads(line)
        return Document(content=doc["segment"], meta={"docid": doc["docid"], "url": doc["url"]})

    @component.output_types(segments=List[Document])
    def run(self, files: List[str]):
        """
        Parse every line of every file in `files`.

        :param files: Paths of the JSON-lines corpus files to read.
        :returns: A dictionary whose "segments" entry holds the documents parsed
            from all the files, in the order the files were iterated. The list
            is empty when `files` is empty.
        """
        # Accumulate across files: rebinding the list for each file would keep only
        # the last file's documents and leave the name unbound for an empty input.
        results = []
        for file in files:
            # explicit encoding so parsing does not depend on the platform default
            with open(file, "r", encoding="utf-8") as f:
                # blank lines (e.g. a trailing newline) are not valid JSON, skip them
                results.extend(self.create_document(line) for line in f if line.strip())
        return {"segments": results}


def indexing(doc_store, model: str, chunk_size: int, files_to_index: Set[str], split_overlap: int = 5):
    """
    Parse, split, embed and store the given TREC corpus files.

    The pipeline is linear: converter -> splitter -> embedder -> writer.
    Documents already present in the store are skipped (`DuplicatePolicy.SKIP`).

    :param doc_store: Document store the embedded chunks are written to.
    :param model: Name of the sentence-transformers model used to embed the chunks.
    :param chunk_size: Length of each chunk, passed to the splitter as `split_length`.
    :param files_to_index: Paths of the corpus files to index.
    :param split_overlap: Overlap between consecutive chunks. Defaults to 5, the
        value that used to be hard-coded.
    :returns: The same `doc_store` that was passed in.
    """
    pipeline = Pipeline()
    pipeline.add_component("converter", ParseTRECCorpus())
    # no `split_by` is given, so the splitter's default unit is used (splitting by word)
    pipeline.add_component("splitter", DocumentSplitter(split_length=chunk_size, split_overlap=split_overlap))
    pipeline.add_component("writer", DocumentWriter(document_store=doc_store, policy=DuplicatePolicy.SKIP))
    pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(model))
    pipeline.connect("converter", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    # the converter declares its `files` input as a list, so hand it one even when a set is passed
    pipeline.run({"converter": {"files": list(files_to_index)}})

    return doc_store


def built_basic_rag(document_store, embedding_model, top_k=None):
    """
    Assemble the baseline RAG pipeline on top of a Qdrant document store.

    Wiring: query_embedder -> retriever -> prompt_builder -> llm -> answer_builder,
    with the retrieved documents also forwarded to the answer builder.

    :param document_store: Qdrant document store the retriever queries.
    :param embedding_model: Name of the sentence-transformers model used to embed the query.
    :param top_k: Optional number of documents to retrieve. When left as `None` the
        retriever is built without a `top_k` argument, exactly as before, so its own
        default applies.
    :returns: The assembled (not yet run) `Pipeline`.
    """
    # Adjacent string literals are concatenated verbatim, so the separators have to be
    # spelled out; without them the instructions, every context, the question and
    # "Answer:" are glued together in the rendered prompt.
    template = (
        "You have to answer the following question based on the contexts given below. "
        "If all the contexts are empty answer with None, example: None. "
        "Otherwise, analyze all the contexts and build a coherent and complete answer. "
        "Split your answer into multiple sentences, and for each sentence please provide the context number "
        "that you used to generate that sentence.\n\n"
        "{% for document in documents %}"
        "Context {{loop.index}}: {{document.content}}\n"
        "{% endfor %}"
        "\nQuestion: {{question}}\n"
        "Answer:"
    )

    # only forward `top_k` when the caller asked for it, to keep the previous behaviour by default
    if top_k is None:
        retriever = QdrantEmbeddingRetriever(document_store)
    else:
        retriever = QdrantEmbeddingRetriever(document_store, top_k=top_k)

    basic_rag = Pipeline()
    basic_rag.add_component(
        "query_embedder", SentenceTransformersTextEmbedder(model=embedding_model, progress_bar=False)
    )
    basic_rag.add_component("retriever", retriever)
    basic_rag.add_component("prompt_builder", PromptBuilder(template=template))
    basic_rag.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo"))
    basic_rag.add_component("answer_builder", AnswerBuilder())

    basic_rag.connect("query_embedder", "retriever.query_embedding")
    basic_rag.connect("retriever", "prompt_builder.documents")
    basic_rag.connect("prompt_builder", "llm")
    basic_rag.connect("llm.replies", "answer_builder.replies")
    basic_rag.connect("llm.meta", "answer_builder.meta")
    basic_rag.connect("retriever", "answer_builder.documents")

    return basic_rag


def pipeline_task_1(document_store, embedding_model):
    """
    Assemble the retrieval-only pipeline: a query embedder feeding a Qdrant embedding retriever.

    :param document_store: Qdrant document store the retriever queries.
    :param embedding_model: Name of the sentence-transformers model used to embed the query.
    :returns: The assembled (not yet run) `Pipeline`.
    """
    text_embedder = SentenceTransformersTextEmbedder(model=embedding_model, progress_bar=False)
    qdrant_retriever = QdrantEmbeddingRetriever(document_store)

    pipe = Pipeline()
    pipe.add_component("query_embedder", text_embedder)
    pipe.add_component("retriever", qdrant_retriever)
    # the embedder's output is wired into the retriever's `query_embedding` input
    pipe.connect("query_embedder", "retriever.query_embedding")

    return pipe
Loading

0 comments on commit 36097e3

Please sign in to comment.