From e98d51aac9aa975095cf5ce705f06018f0b4757d Mon Sep 17 00:00:00 2001
From: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Date: Tue, 18 Feb 2025 10:48:22 -0800
Subject: [PATCH] Add musique adapter base (#525)

## Description

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin

## Summary by CodeRabbit

- **Bug Fixes**
  - Improved data handling by updating the dataset file path and ensuring
    answers are consistently converted to lowercase for reliable processing.

- **Tests**
  - Introduced unit tests to validate that data adapters instantiate
    correctly, return non-empty content, and respect specified limits.

---------

Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Boris
---
 .../benchmark_adapters/musique_adapter.py | 17 +---
 .../tests/unit/benchmark_adapters_test.py | 78 +++++++++++++++++++
 2 files changed, 81 insertions(+), 14 deletions(-)
 create mode 100644 evals/eval_framework/tests/unit/benchmark_adapters_test.py

diff --git a/evals/eval_framework/benchmark_adapters/musique_adapter.py b/evals/eval_framework/benchmark_adapters/musique_adapter.py
index 982e8e8e8..53858cbb0 100644
--- a/evals/eval_framework/benchmark_adapters/musique_adapter.py
+++ b/evals/eval_framework/benchmark_adapters/musique_adapter.py
@@ -4,7 +4,7 @@
 from typing import Optional, Union, Any, LiteralString
 import zipfile

-import gdown  # pip install gdown
+import gdown

 from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter

@@ -17,7 +17,7 @@ class MusiqueQAAdapter(BaseBenchmarkAdapter):

     dataset_info = {
         # Name of the final file we want to load
-        "filename": "musique_ans_v1.0_dev.jsonl",
+        "filename": "data/musique_ans_v1.0_dev.jsonl",
         # A Google Drive URL (or share link) to the ZIP containing this file
         "download_url": "https://drive.google.com/file/d/1tGdADlNjWFaHLeZZGShh2IRcpO6Lv24h/view?usp=sharing",
         # The name of the ZIP archive we expect after downloading
@@ -51,16 +51,13 @@ def load_corpus(
                 "Set auto_download=True or manually place the file."
             )

-        # 2. Read the JSONL file
         with open(target_filename, "r", encoding="utf-8") as f:
             data = [json.loads(line) for line in f]

-        # 3. (Optional) sample a subset of items
         if limit is not None and 0 < limit < len(data):
             random.seed(seed)
             data = random.sample(data, limit)

-        # 4. Build up corpus_list and question_answer_pairs
         corpus_list = []
         question_answer_pairs = []

@@ -70,10 +67,7 @@ def load_corpus(
             combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
             corpus_list.append(combined_paragraphs)

-            # Example question & answer
-            # Adjust keys to match your actual JSON structure if needed
             question = item.get("question", "")
-            # If you have a known 'answer' key, or sometimes it's "answer_aliases", adapt accordingly
             answer = item.get("answer", "")

             question_answer_pairs.append(
@@ -100,20 +94,15 @@ def _musique_download_file(self) -> None:
             return

         print(f"Attempting to download from Google Drive: {url}")
-        # Using gdown to download the ZIP from a Google Drive link
         gdown.download(url=url, output=zip_filename, quiet=False, fuzzy=True)

-        # Unzip the downloaded file
         if os.path.exists(zip_filename):
             print(f"Unzipping {zip_filename} ...")
             with zipfile.ZipFile(zip_filename, "r") as zip_ref:
-                zip_ref.extractall()  # Extract to current directory
-            # Optionally remove the ZIP after extraction
-            os.remove(zip_filename)
+                zip_ref.extractall()
         else:
             raise FileNotFoundError(f"Failed to download the zip file: {zip_filename}")

-        # Optional check: ensure the final .jsonl appeared
         if not os.path.exists(target_filename):
             raise FileNotFoundError(
                 f"After unzipping, '{target_filename}' not found. "
diff --git a/evals/eval_framework/tests/unit/benchmark_adapters_test.py b/evals/eval_framework/tests/unit/benchmark_adapters_test.py
new file mode 100644
index 000000000..a3e295910
--- /dev/null
+++ b/evals/eval_framework/tests/unit/benchmark_adapters_test.py
@@ -0,0 +1,78 @@
+import pytest
+import random
+from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+
+
+ADAPTER_CLASSES = [
+    HotpotQAAdapter,
+    MusiqueQAAdapter,
+    DummyAdapter,
+    TwoWikiMultihopAdapter,
+]
+
+
+@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
+def test_adapter_can_instantiate_and_load(AdapterClass):
+    """
+    Basic smoke test: instantiate each adapter, call load_corpus with no limit,
+    and ensure it returns the expected tuple of (list, list).
+    """
+    adapter = AdapterClass()
+    result = adapter.load_corpus()
+
+    assert isinstance(result, tuple), f"{AdapterClass.__name__} did not return a tuple."
+    assert len(result) == 2, f"{AdapterClass.__name__} returned tuple of length != 2."
+
+    corpus_list, qa_pairs = result
+    assert isinstance(corpus_list, list), f"{AdapterClass.__name__} corpus_list is not a list."
+    assert isinstance(qa_pairs, list), (
+        f"{AdapterClass.__name__} question_answer_pairs is not a list."
+    )
+
+
+@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
+def test_adapter_returns_some_content(AdapterClass):
+    """
+    Verify that the adapter returns some data and that each QA dict
+    at least has a 'question' and 'answer' key (you can extend or remove as needed).
+    """
+    adapter = AdapterClass()
+
+    corpus_list, qa_pairs = adapter.load_corpus(limit=3)  # small limit
+    # We don't know how large the dataset is, but we expect at least 1 item
+    assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
+    assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."
+
+    # Check the shape
+    assert len(corpus_list) == len(qa_pairs), (
+        f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
+        "should typically be the same length. Adjust if your adapter differs."
+    )
+
+    for item in qa_pairs:
+        assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
+        assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."
+
+
+@pytest.mark.parametrize("AdapterClass", ADAPTER_CLASSES)
+def test_adapter_limit(AdapterClass):
+    """
+    Check that the `limit` parameter correctly restricts the amount of data returned.
+    We'll test with limit=5.
+    """
+    adapter = AdapterClass()
+
+    limit = 5
+    corpus_list, qa_pairs = adapter.load_corpus(limit=limit)
+
+    # Confirm that we didn't receive more than 'limit'
+    # (Some adapters might be allowed to return fewer if the dataset is small)
+    assert len(corpus_list) <= limit, (
+        f"{AdapterClass.__name__} returned more items than requested limit={limit}."
+    )
+    assert len(qa_pairs) <= limit, (
+        f"{AdapterClass.__name__} returned more QA items than requested limit={limit}."
+    )
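
---

Reviewer note (not part of the patch): the sketch below shows one way to exercise the MuSiQue adapter by hand, mirroring what the new unit tests assert. It assumes the package is importable from the repository root and that the dataset archive can be downloaded (or has already been extracted to `data/musique_ans_v1.0_dev.jsonl`); the exact constructor and `auto_download` options are not shown in this diff, so only `load_corpus(limit=...)` from the hunks above is relied on here.

```python
# Hypothetical local smoke check: load a small MuSiQue slice and inspect
# the (corpus, question/answer pair) shape the unit tests rely on.
from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter

adapter = MusiqueQAAdapter()
# limit=3 keeps the randomly sampled subset small, as in the unit tests.
corpus_list, qa_pairs = adapter.load_corpus(limit=3)

# load_corpus builds one corpus entry per QA item, so the lengths match.
assert len(corpus_list) == len(qa_pairs) <= 3
for pair in qa_pairs:
    print(pair["question"], "->", pair["answer"])
```

The new tests themselves should run with `pytest evals/eval_framework/tests/unit/benchmark_adapters_test.py`, assuming the repository's usual pytest setup and network access for the adapters that download their datasets.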