From e98d51aac9aa975095cf5ce705f06018f0b4757d Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Tue, 18 Feb 2025 10:48:22 -0800
Subject: [PATCH] Add musique adapter base (#525)

## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin

## Summary by CodeRabbit

- **Bug Fixes**
  - Improved data handling by updating the dataset file path and ensuring
    answers are consistently converted to lowercase for reliable processing.

- **Tests**
  - Introduced unit tests to validate that data adapters instantiate
    correctly, return non-empty content, and respect specified limits.

---------

Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Boris
---
 .../benchmark_adapters/musique_adapter.py | 17 +---
 .../tests/unit/benchmark_adapters_test.py | 78 +++++++++++++++++++
 2 files changed, 81 insertions(+), 14 deletions(-)
 create mode 100644 evals/eval_framework/tests/unit/benchmark_adapters_test.py

diff --git a/evals/eval_framework/benchmark_adapters/musique_adapter.py b/evals/eval_framework/benchmark_adapters/musique_adapter.py
index 982e8e8e8..53858cbb0 100644
--- a/evals/eval_framework/benchmark_adapters/musique_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/musique_adapter.py
@@ -4,7 +4,7 @@
 from typing import Optional, Union, Any, LiteralString
 import zipfile

-import gdown  # pip install gdown
+import gdown

 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter

@@ -17,7 +17,7 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):

     dataset_info = {
         # Name of the final file we want to load
-        "filename": "musique_ans_v1.0_dev.jsonl",
+        "filename": "data/musique_ans_v1.0_dev.jsonl",
         # A Google Drive URL (or share link) to the ZIP containing this file
         "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
         # The name of the ZIP archive we expect after downloading
@@ -51,16 +51,13 @@ def load_corpus(
                 "Set auto_download=True or manually place the file."
             )

-        # 2. Read the JSONL file
         with open(target_filename, "r", encoding="utf-8") as f:
             data = [json.loads(line) for line in f]

-        # 3. (Optional) sample a subset of items
         if limit is not None and 0 < limit < len(data):
             random.seed(seed)
             data = random.sample(data, limit)

-        # 4. Build up corpus_list and question_answer_pairs
         corpus_list = []
         question_answer_pairs = []

@@ -70,10 +67,7 @@ def load_corpus(
             combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
             corpus_list.append(combined_paragraphs)

-            # Example question & answer
-            # Adjust keys to match your actual JSON structure if needed
             question = item.get("question", "")
-            # If you have a known 'answer' key, or sometimes it's "answer_aliases", adapt accordingly
             answer = item.get("answer", "")

             question_answer_pairs.append(
@@ -100,20 +94,15 @@ def _musique_download_file(self) -> None:
             return

         print(f"Attempting to download from Google Drive: {url}")
-        # Using gdown to download the ZIP from a Google Drive link
         gdown.download(url=url, output=zip_filename, quiet=False, fuzzy=True)

-        # Unzip the downloaded file
         if os.path.exists(zip_filename):
             print(f"Unzipping {zip_filename} ...")
             with zipfile.ZipFile(zip_filename, "r") as zip_ref:
-                zip_ref.extractall()  # Extract to current directory
-            # Optionally remove the ZIP after extraction
-            os.remove(zip_filename)
+                zip_ref.extractall()
         else:
             raise FileNotFoundError(f"Failed to download the zip file: {zip_filename}")

-        # Optional check: ensure the final .jsonl appeared
         if not os.path.exists(target_filename):
             raise FileNotFoundError(
                 f"After unzipping, '{target_filename}' not found. "
diff --git a/evals/eval_framework/tests/unit/benchmark_adapters_test.py b/evals/eval_framework/tests/unit/benchmark_adapters_test.py
new file mode 100644
index 000000000..a3e295910
--- /dev/null
+++ b/evals/eval_framework/tests/unit/benchmark_adapters_test.py
@@ -0,0 +1,78 @@
+import pytest
+import random
+from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+
+
+ADAPTER_CLASSES = [
+    HotpotQAAdapter,
+    MusiqueQAAdapter,
+    DummyAdapter,
+    TwoWikiMultihopAdapter,
+]
+
+
+@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
+def test_adapter_can_instantiate_and_load(AdapterClass):
+    """
+    Basic smoke test: instantiate each adapter, call load_corpus with no limit,
+    and ensure it returns the expected tuple of (list, list).
+    """
+    adapter = AdapterClass()
+    result = adapter.load_corpus()
+
+    assert isinstance(result, tuple), f"{AdapterClass.__name__} did not return a tuple."
+    assert len(result) == 2, f"{AdapterClass.__name__} returned tuple of length != 2."
+
+    corpus_list, qa_pairs = result
+    assert isinstance(corpus_list, list), f"{AdapterClass.__name__} corpus_list is not a list."
+    assert isinstance(qa_pairs, list), (
+        f"{AdapterClass.__name__} question_answer_pairs is not a list."
+    )
+
+
+@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
+def test_adapter_returns_some_content(AdapterClass):
+    """
+    Verify that the adapter returns some data and that each QA dict
+    at least has a 'question' and 'answer' key (you can extend or remove as needed).
+    """
+    adapter = AdapterClass()
+
+    corpus_list, qa_pairs = adapter.load_corpus(limit=3)  # small limit
+    # We don't know how large the dataset is, but we expect at least 1 item
+    assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
+    assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."
+
+    # Check the shape
+    assert len(corpus_list) == len(qa_pairs), (
+        f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
+        "should typically be the same length. Adjust if your adapter differs."
+    )
+
+    for item in qa_pairs:
+        assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
+        assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."
+
+
+@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
+def test_adapter_limit(AdapterClass):
+    """
+    Check that the `limit` parameter correctly restricts the amount of data returned.
+    We'll test with limit=5.
+    """
+    adapter = AdapterClass()
+
+    limit = 5
+    corpus_list, qa_pairs = adapter.load_corpus(limit=limit)
+
+    # Confirm that we didn't receive more than 'limit'
+    # (Some adapters might be allowed to return fewer if the dataset is small)
+    assert len(corpus_list) <= limit, (
+        f"{AdapterClass.__name__} returned more items than requested limit={limit}."
+    )
+    assert len(qa_pairs) <= limit, (
+        f"{AdapterClass.__name__} returned more QA items than requested limit={limit}."
+    )
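
---

Reviewer note (not part of the patch): the sketch below shows one way to exercise the MuSiQue adapter by hand, mirroring what the new unit tests assert. It assumes the package is importable from the repository root and that the dataset archive can be downloaded (or has already been extracted to `data/musique_ans_v1.0_dev.jsonl`); the exact constructor and `auto_download` options are not shown in this diff, so only `load_corpus(limit=...)` from the hunks above is relied on here.

```python
# Hypothetical local smoke check: load a small MuSiQue slice and inspect
# the (corpus, question/answer pair) shape the unit tests rely on.
from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter

adapter = MusiqueQAAdapter()
# limit=3 keeps the randomly sampled subset small, as in the unit tests.
corpus_list, qa_pairs = adapter.load_corpus(limit=3)

# load_corpus builds one corpus entry per QA item, so the lengths match.
assert len(corpus_list) == len(qa_pairs) <= 3
for pair in qa_pairs:
    print(pair["question"], "->", pair["answer"])
```

The new tests themselves should run with `pytest evals/eval_framework/tests/unit/benchmark_adapters_test.py`, assuming the repository's usual pytest setup and network access for the adapters that download their datasets.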