Skip to content

Commit

Permalink
Test: Parse context pieces separately in MusiqueQAAdapter and adjust tests [cog-1234] (#561)
Browse files Browse the repository at this point in the history

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **Tests**
- Updated evaluation checks by removing assertions related to the
relationship between `corpus_list` and `qa_pairs`, now focusing solely
on `qa_pairs` limits.

- **Refactor**
- Improved content processing to append each paragraph individually to
`corpus_list`, enhancing clarity in data structure.
- Simplified type annotations in the `load_corpus` method across
multiple adapters, ensuring consistency in return types.

- **Chores**
- Updated dependency installation commands in GitHub Actions workflows
for Python 3.10, 3.11, and 3.12 to include additional evaluation-related
dependencies.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Co-authored-by: Vasilije <[email protected]>
  • Loading branch information
alekszievr and Vasilije1990 authored Feb 20, 2025
1 parent e25c7c9 commit 17231de
Show file tree
Hide file tree
Showing 8 changed files with 13 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_10.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction -E docs
run: poetry install --no-interaction -E docs -E evals
- name: Download NLTK tokenizer data
run: |
poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_11.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction -E docs
run: poetry install --no-interaction -E docs -E evals

- name: Download NLTK tokenizer data
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_12.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction -E docs
run: poetry install --no-interaction -E docs -E evals
- name: Download NLTK tokenizer data
run: |
poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pytest
import random
from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
Expand Down Expand Up @@ -46,12 +45,6 @@ def test_adapter_returns_some_content(AdapterClass):
assert len(corpus_list) > 0, f"{AdapterClass.__name__} returned an empty corpus_list."
assert len(qa_pairs) > 0, f"{AdapterClass.__name__} returned an empty question_answer_pairs."

# Check the shape
assert len(corpus_list) == len(qa_pairs), (
f"{AdapterClass.__name__} corpus_list and question_answer_pairs "
"should typically be the same length. Adjust if your adapter differs."
)

for item in qa_pairs:
assert "question" in item, f"{AdapterClass.__name__} missing 'question' key in QA pair."
assert "answer" in item, f"{AdapterClass.__name__} missing 'answer' key in QA pair."
Expand All @@ -70,9 +63,7 @@ def test_adapter_limit(AdapterClass):

# Confirm that we didn't receive more than 'limit'
# (Some adapters might be allowed to return fewer if the dataset is small)
assert len(corpus_list) <= limit, (
f"{AdapterClass.__name__} returned more items than requested limit={limit}."
)

assert len(qa_pairs) <= limit, (
f"{AdapterClass.__name__} returned more QA items than requested limit={limit}."
)
4 changes: 2 additions & 2 deletions evals/eval_framework/benchmark_adapters/dummy_adapter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from typing import Optional, Union, Any, LiteralString
from typing import Optional

from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


class DummyAdapter(BaseBenchmarkAdapter):
def load_corpus(
self, limit: Optional[int] = None, seed: int = 42
) -> tuple[list[Union[LiteralString, str]], list[dict[str, str]]]:
) -> tuple[list[str], list[dict[str, str]]]:
corpus_list = [
"The cognee is an AI memory engine that supports different vector and graph databases",
"Neo4j is a graph database supported by cognee",
Expand Down
4 changes: 2 additions & 2 deletions evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import json
import random
from typing import Optional, Union, Any, LiteralString
from typing import Optional, Any
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


Expand All @@ -16,7 +16,7 @@ class HotpotQAAdapter(BaseBenchmarkAdapter):

def load_corpus(
self, limit: Optional[int] = None, seed: int = 42
) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
) -> tuple[list[str], list[dict[str, Any]]]:
filename = self.dataset_info["filename"]

if os.path.exists(filename):
Expand Down
6 changes: 3 additions & 3 deletions evals/eval_framework/benchmark_adapters/musique_adapter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import json
import random
from typing import Optional, Union, Any, LiteralString
from typing import Optional, Any
import zipfile

import gdown
Expand Down Expand Up @@ -64,8 +64,8 @@ def load_corpus(
for item in data:
# Each 'paragraphs' is a list of dicts; append each 'paragraph_text' to the corpus individually
paragraphs = item.get("paragraphs", [])
combined_paragraphs = " ".join(paragraph["paragraph_text"] for paragraph in paragraphs)
corpus_list.append(combined_paragraphs)
for paragraph in paragraphs:
corpus_list.append(paragraph["paragraph_text"])

question = item.get("question", "")
answer = item.get("answer", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import json
import random
from typing import Optional, Union, Any, LiteralString
from typing import Optional, Any
from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


Expand All @@ -14,7 +14,7 @@ class TwoWikiMultihopAdapter(BaseBenchmarkAdapter):

def load_corpus(
self, limit: Optional[int] = None, seed: int = 42
) -> tuple[list[Union[LiteralString, str]], list[dict[str, Any]]]:
) -> tuple[list[str], list[dict[str, Any]]]:
filename = self.dataset_info["filename"]

if os.path.exists(filename):
Expand Down

0 comments on commit 17231de

Please sign in to comment.