Merge pull request #16 from deepset-ai/trec

TREC: PoC
deepset-ai · Jul 15, 2024 · 36097e3 · 36097e3
2 parents 4473327 + b49a1ab
commit 36097e3
Show file tree

Hide file tree

Showing 8 changed files with 365 additions and 43 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,9 @@
+# Local run files
+qa.db
+**/qa.db
+**/*qa*.db
+**/test-reports
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -20,6 +26,7 @@ parts/
 sdist/
 var/
 wheels/
+pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
@@ -49,7 +56,6 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
-cover/
 
 # Translations
 *.mo
@@ -68,11 +74,10 @@ instance/
 # Scrapy stuff:
 .scrapy
 
-# Sphinx documentation
-docs/_build/
+# documentation
+docs/pydoc/temp/
 
 # PyBuilder
-.pybuilder/
 target/
 
 # Jupyter Notebook
@@ -83,9 +88,7 @@ profile_default/
 ipython_config.py
 
 # pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version
 
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -94,22 +97,7 @@ ipython_config.py
 #   install all needed dependencies.
 #Pipfile.lock
 
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+# pyflow
 __pypackages__/
 
 # Celery stuff
@@ -146,20 +134,17 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
 # PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea
 
-# MacOS
+# VSCode
+.vscode
+
+# macOS
 .DS_Store
-*/.DS_Store
-**/.DS_Store
+
+# http cache (requests-cache)
+**/http_cache.sqlite
+
+# ruff
+.ruff_cache
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -20,12 +20,16 @@ repos:
     rev: v0.5.0
     hooks:
       - id: ruff
+        exclude: "datasets/*"
       - id: ruff-format
+        exclude: "datasets/*"
+
 
   - repo: https://github.com/codespell-project/codespell
     rev: v2.3.0
     hooks:
       - id: codespell
+        exclude: "datasets/*|evaluations/trec/trec_evaluation.py"
         additional_dependencies:
           - tomli
 

diff --git a/evaluations/evaluation_aragog.py b/evaluations/evaluation_aragog.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 from typing import List, Tuple
 
-from architectures.basic_rag import basic_rag
+from architectures.baseline_rag import built_basic_rag
 from haystack import Pipeline
 from haystack.components.converters import PyPDFToDocument
 from haystack.components.embedders import SentenceTransformersDocumentEmbedder
@@ -57,7 +57,7 @@ def run_basic_rag(doc_store, sample_questions, embedding_model, top_k):
     Runs the basic rag model on a set of sample questions and answers.
     """
 
-    rag = basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=top_k)
+    rag = built_basic_rag(document_store=doc_store, embedding_model=embedding_model, top_k=top_k)
 
     predicted_answers = []
     retrieved_contexts = []

diff --git a/evaluations/evaluation_aragog_harness.py b/evaluations/evaluation_aragog_harness.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import List, Tuple
 
-from architectures.basic_rag import basic_rag
+from architectures.baseline_rag import built_basic_rag
 from architectures.hyde_rag import rag_with_hyde
 from haystack import Pipeline
 from haystack.components.converters import PyPDFToDocument
@@ -90,7 +90,7 @@ def main():
     doc_store = indexing(embeddings, chunk_size)
 
     # baseline RAG
-    rag = basic_rag(document_store=doc_store, embedding_model=embeddings, top_k=top_k)
+    rag = built_basic_rag(document_store=doc_store, embedding_model=embeddings, top_k=top_k)
     rag_components = {
         RAGExpectedComponent.QUERY_PROCESSOR: RAGExpectedComponentMetadata(
             name="query_embedder", input_mapping={"query": "text"}

diff --git a/evaluations/evaluation_sentence_window_retrieval.py b/evaluations/evaluation_sentence_window_retrieval.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import List, Tuple
 
-from architectures.basic_rag import basic_rag
+from architectures.baseline_rag import built_basic_rag
 from architectures.sentence_window_retrieval import rag_sentence_window_retrieval
 from haystack import Pipeline
 from haystack.components.converters import PyPDFToDocument
@@ -115,7 +115,7 @@ def main():
     eval_results_rag_window = EvaluationRunResult(run_name="window-retrieval", inputs=inputs, results=results)
 
     # Baseline RAG
-    rag = basic_rag(doc_store, embedding_model, top_k)
+    rag = built_basic_rag(doc_store, embedding_model, top_k)
     retrieved_contexts, predicted_answers = run_rag(rag, questions)
     results, inputs = run_evaluation(questions, answers, retrieved_contexts, predicted_answers, embedding_model)
     eval_results_base_rag = EvaluationRunResult(run_name="base-rag", inputs=inputs, results=results)

diff --git a/evaluations/trec/indexing.py b/evaluations/trec/indexing.py
@@ -0,0 +1,30 @@
+from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
+from pipelines import indexing
+
+
+def get_qdrant_doc_store(embedding_dim: int = 768):
+    """
+    Builds the Qdrant document store used for the TREC experiments.
+
+    :param embedding_dim: Dimensionality of the embedding vectors stored in the index.
+    :returns: A `QdrantDocumentStore` pointing at the "trec2024" index on localhost.
+    """
+    hnsw_settings = {"m": 16, "ef_construct": 64}  # Optional
+
+    # NOTE(review): recreate_index=True presumably wipes an existing "trec2024" index -- confirm before reuse.
+    return QdrantDocumentStore(
+        url="localhost",
+        index="trec2024",
+        embedding_dim=embedding_dim,
+        on_disk=True,
+        recreate_index=True,
+        hnsw_config=hnsw_settings,
+    )
+
+
+def main():
+    """
+    Indexes the sample TREC corpus file into a Qdrant instance running on localhost.
+    """
+    from pathlib import Path
+
+    # make sure you have Qdrant running on localhost
+    print("Connecting to Qdrant...")
+    doc_store = get_qdrant_doc_store()
+    print("Indexing documents...")
+
+    # Anchor the corpus path to the repository root (two levels above this file's directory) instead of
+    # the current working directory, so the script also works when launched from another directory.
+    repo_root = Path(__file__).resolve().parents[2]
+
+    # we manually created a sample file of around 150MB for testing purposes
+    sample_file = repo_root / "datasets" / "TREC" / "corpus" / "msmarco_v2.1_doc_segmented_00_sample.json"
+    files_to_index = {str(sample_file)}
+    indexing(doc_store, "sentence-transformers/msmarco-distilroberta-base-v2", 128, files_to_index)
+
+
+# Run the indexing job only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/evaluations/trec/pipelines.py b/evaluations/trec/pipelines.py
@@ -0,0 +1,84 @@
+import json
+from typing import List, Set
+
+from haystack import Document, Pipeline, component
+from haystack.components.builders import AnswerBuilder, PromptBuilder
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.generators import OpenAIGenerator
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.types import DuplicatePolicy
+from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
+
+
+@component
+class ParseTRECCorpus:
+    """
+    Converts TREC corpus files (one JSON object per line) into Haystack Documents.
+    """
+
+    @staticmethod
+    def create_document(line: str):
+        """
+        Builds a Document from a single JSON line.
+
+        The JSON object must provide the keys "segment", "docid" and "url".
+
+        :param line: One line of the corpus file, holding a JSON object.
+        :returns: A Document whose content is the segment and whose meta holds "docid" and "url".
+        """
+        doc = json.loads(line)
+        return Document(content=doc["segment"], meta={"docid": doc["docid"], "url": doc["url"]})
+
+    @component.output_types(segments=List[Document])
+    def run(self, files: List[str]):
+        """
+        Parses every file in `files` and returns the Documents of all of them.
+
+        :param files: Paths of the JSON-lines corpus files to read.
+        :returns: A dictionary with the key "segments" holding the Documents of all files
+            (an empty list when `files` is empty).
+        """
+        # Accumulate across files: assigning inside the loop would keep only the last file's
+        # documents and leave the name undefined when `files` is empty.
+        segments: List[Document] = []
+        for file in files:
+            # JSON text is UTF-8; do not depend on the platform's default encoding
+            with open(file, "r", encoding="utf-8") as f:
+                # skip blank lines (e.g. a trailing newline), which are not valid JSON
+                segments.extend(self.create_document(line) for line in f if line.strip())
+        return {"segments": segments}
+
+
+def indexing(doc_store, model: str, chunk_size: int, files_to_index: Set[str]):
+    """
+    Parses, splits, embeds and writes the given corpus files into the document store.
+
+    :param doc_store: Document store the embedded chunks are written to.
+    :param model: Name of the Sentence Transformers model used to embed the chunks.
+    :param chunk_size: Split length handed to the DocumentSplitter.
+    :param files_to_index: Paths of the corpus files to index.
+    :returns: The same `doc_store` that was passed in.
+    """
+    pipeline = Pipeline()
+    pipeline.add_component("converter", ParseTRECCorpus())
+    pipeline.add_component("splitter", DocumentSplitter(split_length=chunk_size, split_overlap=5))  # splitting by word
+    pipeline.add_component("writer", DocumentWriter(document_store=doc_store, policy=DuplicatePolicy.SKIP))
+    pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(model))
+    pipeline.connect("converter", "splitter")
+    pipeline.connect("splitter", "embedder")
+    pipeline.connect("embedder", "writer")
+
+    # The converter declares `files: List[str]`; hand it a sorted list so the input matches that type
+    # and the files are processed in a reproducible order (set iteration order is not stable across runs).
+    pipeline.run({"converter": {"files": sorted(files_to_index)}})
+
+    return doc_store
+
+
+def built_basic_rag(document_store, embedding_model):
+    """
+    Builds the baseline RAG pipeline: query embedding, Qdrant retrieval, prompting and answer building.
+
+    :param document_store: Document store handed to the QdrantEmbeddingRetriever.
+    :param embedding_model: Name of the Sentence Transformers model used to embed the query.
+    :returns: The assembled, not yet executed, Pipeline.
+    """
+    # Each section ends with a newline so that the instructions, every context, the question and the
+    # answer cue are rendered on separate lines instead of being glued together in the final prompt.
+    template = (
+        "You have to answer the following question based on the contexts given below. "
+        "If all the contexts are empty answer with None, example: None. "
+        "Otherwise, analyze all the contexts and build a coherent answer and complete answer. "
+        "Split your answer into multiple sentences, and for each sentence please provide the context number "
+        "that you used to generate that sentence.\n"
+        "{% for document in documents %}"
+        "Context {{loop.index}}: {{document.content}}\n"
+        "{%endfor %}"
+        "Question: {{question}}\n"
+        "Answer:"
+    )
+
+    basic_rag = Pipeline()
+    basic_rag.add_component(
+        "query_embedder", SentenceTransformersTextEmbedder(model=embedding_model, progress_bar=False)
+    )
+    basic_rag.add_component("retriever", QdrantEmbeddingRetriever(document_store))
+    basic_rag.add_component("prompt_builder", PromptBuilder(template=template))
+    basic_rag.add_component("llm", OpenAIGenerator(model="gpt-3.5-turbo"))
+    basic_rag.add_component("answer_builder", AnswerBuilder())
+
+    # retrieval feeds both the prompt and the answer builder
+    basic_rag.connect("query_embedder", "retriever.query_embedding")
+    basic_rag.connect("retriever", "prompt_builder.documents")
+    basic_rag.connect("prompt_builder", "llm")
+    basic_rag.connect("llm.replies", "answer_builder.replies")
+    basic_rag.connect("llm.meta", "answer_builder.meta")
+    basic_rag.connect("retriever", "answer_builder.documents")
+
+    return basic_rag
+
+
+def pipeline_task_1(document_store, embedding_model):
+    """
+    Builds a retrieval-only pipeline: the query text is embedded and used to retrieve from Qdrant.
+
+    :param document_store: Document store handed to the QdrantEmbeddingRetriever.
+    :param embedding_model: Name of the Sentence Transformers model used to embed the query.
+    :returns: The assembled, not yet executed, Pipeline.
+    """
+    text_embedder = SentenceTransformersTextEmbedder(model=embedding_model, progress_bar=False)
+    qdrant_retriever = QdrantEmbeddingRetriever(document_store)
+
+    pipe = Pipeline()
+    pipe.add_component("query_embedder", text_embedder)
+    pipe.add_component("retriever", qdrant_retriever)
+    pipe.connect("query_embedder", "retriever.query_embedding")
+
+    return pipe