Add musique adapter base (#525)
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **Bug Fixes**
  - Improved data handling by updating the dataset file path and ensuring answers are consistently converted to lowercase for reliable processing (an illustrative sketch follows below).

- **Tests**
  - Introduced unit tests to validate that data adapters instantiate correctly, return non-empty content, and respect specified limits.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
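
The lowercase-answer normalization mentioned under **Bug Fixes** does not appear among the added lines visible in the squashed diff below, so the following is only an illustration of what such normalization amounts to; the variable names follow the adapter's `item.get(...)` usage and are otherwise assumed:

    # Illustration only -- not a line from this diff.
    answer = item.get("answer", "")
    if isinstance(answer, str):
        answer = answer.lower()  # lowercase so later answer comparisons are case-insensitive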

---------

Co-authored-by: Igor Ilic <[email protected]>
Co-authored-by: Boris <[email protected]>
3 people authored Feb 18, 2025
1 parent 4efdb29 commit e98d51a
Showing 2 changed files with 81 additions and 14 deletions.
17 changes: 3 additions & 14 deletions evals/eval_framework/benchmark_adapters/musique_adapter.py
@@ -4,7 +4,7 @@
 from typing import Optional, Union, Any, LiteralString
 import zipfile

-import gdown  # pip install gdown
+import gdown

 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter

@@ -17,7 +17,7 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):

     dataset_info = {
         # Name of the final file we want to load
-        "filename": "musique_ans_v1.0_dev.jsonl",
+        "filename": "data/musique_ans_v1.0_dev.jsonl",
         # A Google Drive URL (or share link) to the ZIP containing this file
         "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
         # The name of the ZIP archive we expect after downloading
@@ -51,16 +51,13 @@ def load_corpus(
                 "Set auto_download=True or manually place the file."
             )

-        # 2. Read the JSONL file
         with open(target_filename, "r", encoding="utf-8") as f:
             data = [json.loads(line) for line in f]

-        # 3. (Optional) sample a subset of items
         if limit is not None and 0 < limit < len(data):
             random.seed(seed)
             data = random.sample(data, limit)

-        # 4. Build up corpus_list and question_answer_pairs
         corpus_list = []
         question_answer_pairs = []

@@ -70,10 +67,7 @@ def load_corpus(
             combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
             corpus_list.append(combined_paragraphs)

-            # Example question & answer
-            # Adjust keys to match your actual JSON structure if needed
             question = item.get("question", "")
-            # If you have a known 'answer' key, or sometimes it's "answer_aliases", adapt accordingly
             answer = item.get("answer", "")

             question_answer_pairs.append(
@@ -100,20 +94,15 @@ def _musique_download_file(self) -> None:
             return

         print(f"Attempting to download from Google Drive: {url}")
-        # Using gdown to download the ZIP from a Google Drive link
         gdown.download(url=url, output=zip_filename, quiet=False, fuzzy=True)

-        # Unzip the downloaded file
         if os.path.exists(zip_filename):
             print(f"Unzipping {zip_filename} ...")
             with zipfile.ZipFile(zip_filename, "r") as zip_ref:
-                zip_ref.extractall()  # Extract to current directory
-            # Optionally remove the ZIP after extraction
-            os.remove(zip_filename)
+                zip_ref.extractall()
         else:
             raise FileNotFoundError(f"Failed to download the zip file: {zip_filename}")

-        # Optional check: ensure the final .jsonl appeared
         if not os.path.exists(target_filename):
             raise FileNotFoundError(
                 f"After unzipping, '{target_filename}' not found. "
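
For orientation, a brief usage sketch of the adapter after this change. This is not part of the commit; it assumes the MuSiQue JSONL file is already present (or downloadable -- the error message above refers to an auto_download flag whose default is not shown here), and it relies only on the load_corpus behaviour exercised by the tests below.

# Usage sketch (assumption, not from the commit).
from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter

adapter = MusiqueQAAdapter()
# limit samples a subset of items; the sampling is seeded via random.seed(seed) as shown above.
corpus_list, question_answer_pairs = adapter.load_corpus(limit=5)

for paragraphs_text, qa in zip(corpus_list, question_answer_pairs):
    print(qa["question"], "->", qa["answer"])
    print(paragraphs_text[:80])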
78 changes: 78 additions & 0 deletions evals/eval_framework/tests/unit/benchmark_adapters_test.py
@@ -0,0 +1,78 @@
import pytest
import random
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter


ADAPTER_CLASSES = [
    HotpotQAAdapter,
    MusiqueQAAdapter,
    DummyAdapter,
    TwoWikiMultihopAdapter,
]


@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
def test_adapter_can_instantiate_and_load(AdapterClass):
    """
    Basic smoke test: instantiate each adapter, call load_corpus with no limit,
    and ensure it returns the expected tuple of (list, list).
    """
    adapter = AdapterClass()
    result = adapter.load_corpus()

    assert isinstance(result, tuple), f"{AdapterClass.__name__} did not return a tuple."
    assert len(result) == 2, f"{AdapterClass.__name__} returned tuple of length != 2."

    corpus_list, qa_pairs = result
    assert isinstance(corpus_list, list), f"{AdapterClass.__name__} corpus_list is not a list."
    assert isinstance(qa_pairs, list), (
        f"{AdapterClass.__name__} question_answer_pairs is not a list."
    )


@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
def test_adapter_returns_some_content(AdapterClass):
    """
    Verify that the adapter returns some data and that each QA dict
    at least has a 'question' and 'answer' key (you can extend or remove as needed).
    """
    adapter = AdapterClass()

    corpus_list, qa_pairs = adapter.load_corpus(limit=3)  # small limit
    # We don't know how large the dataset is, but we expect at least 1 item
    assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
    assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."

    # Check the shape
    assert len(corpus_list) == len(qa_pairs), (
        f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
        "should typically be the same length. Adjust if your adapter differs."
    )

    for item in qa_pairs:
        assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
        assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."


@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
def test_adapter_limit(AdapterClass):
    """
    Check that the `limit` parameter correctly restricts the amount of data returned.
    We'll test with limit=5.
    """
    adapter = AdapterClass()

    limit = 5
    corpus_list, qa_pairs = adapter.load_corpus(limit=limit)

    # Confirm that we didn't receive more than 'limit'
    # (Some adapters might be allowed to return fewer if the dataset is small)
    assert len(corpus_list) <= limit, (
        f"{AdapterClass.__name__} returned more items than requested limit={limit}."
    )
    assert len(qa_pairs) <= limit, (
        f"{AdapterClass.__name__} returned more QA items than requested limit={limit}."
    )
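
These tests exercise only the shared load_corpus contract that each adapter inherits from base_benchmark_adapter. The base class itself is not part of this commit; purely as an assumption, the contract the tests rely on is roughly:

# Assumed sketch of BaseBenchmarkAdapter (not shown in this diff); it captures only
# what the tests above check: a (corpus_list, question_answer_pairs) tuple whose QA
# dicts carry 'question' and 'answer' keys, optionally trimmed by `limit`.
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple

class BaseBenchmarkAdapter(ABC):
    @abstractmethod
    def load_corpus(
        self, limit: Optional[int] = None
    ) -> Tuple[List[str], List[Dict[str, Any]]]: ...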
