diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
index c5181d0ab..99e391c24 100644
--- a/mteb/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Annotated
@@ -918,3 +919,97 @@ def load_results(
     reference=None,
     citation=None,
 )
+
+
+MAIR_TASK_CONFIG = json.loads(
+    '{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}'
+)
+
+
+def _get_mair_tasks_by_domain(domain):
+    assert domain in ["Academic", "Code", "Web", "Legal", "Medical", "Finance"]
+    out = []
+    for task in MAIR_TASK_CONFIG:
+        if MAIR_TASK_CONFIG[task] == domain:
+            out.append(task)
+    return out
+
+
+def _get_mair_all_tasks():
+    return list(MAIR_TASK_CONFIG.keys())
+
+
+_MAIR_CITATION = """@inproceedings{Sun2024MAIR,
+  title={MAIR: A Massive Benchmark for Evaluating Instructed Retrieval},
+  author={Weiwei Sun and Zhengliang Shi and Jiulong Wu and Lingyong Yan and Xinyu Ma and Yiding Liu and Min Cao and Dawei Yin and Zhaochun Ren},
+  booktitle={EMNLP},
+  year={2024},
+}"""
+
+MAIR = Benchmark(
+    name="MAIR",
+    tasks=get_tasks(tasks=["MAIR-" + name for name in _get_mair_all_tasks()]),
+    description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval",
+    reference="https://github.com/sunnweiwei/MAIR",
+    citation=_MAIR_CITATION,
+)
+
+MAIR_WEB = Benchmark(
+    name="MAIR(Web)",
+    tasks=get_tasks(
+        tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Web")]
+    ),
+    description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval",
+    reference="https://github.com/sunnweiwei/MAIR",
+    citation=_MAIR_CITATION,
+)
+
+MAIR_CODE = Benchmark(
+    name="MAIR(Code)",
+    tasks=get_tasks(
+        tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Code")]
+    ),
+    description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval",
+    reference="https://github.com/sunnweiwei/MAIR",
+    citation=_MAIR_CITATION,
+)
+
+MAIR_ACADEMIC = Benchmark(
+    name="MAIR(Academic)",
+    tasks=get_tasks(
+        tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Academic")]
+    ),
+    description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval",
+    reference="https://github.com/sunnweiwei/MAIR",
+    citation=_MAIR_CITATION,
+)
+
+MAIR_LEGAL = Benchmark(
+    name="MAIR(Legal)",
+    tasks=get_tasks(
+        tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Legal")]
+    ),
+    description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval",
+    reference="https://github.com/sunnweiwei/MAIR",
+    citation=_MAIR_CITATION,
+)
+
+MAIR_MEDICAL = Benchmark(
+    name="MAIR(Medical)",
+    tasks=get_tasks(
+        tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Medical")]
+    ),
+    description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval",
+    reference="https://github.com/sunnweiwei/MAIR",
+    citation=_MAIR_CITATION,
+)
+
+MAIR_FINANCE = Benchmark(
+    name="MAIR(Finance)",
+    tasks=get_tasks(
+        tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Finance")]
+    ),
+    description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval",
+    reference="https://github.com/sunnweiwei/MAIR",
+    citation=_MAIR_CITATION,
+)
diff --git a/mteb/tasks/MAIR/__init__.py b/mteb/tasks/MAIR/__init__.py
new file mode 100644
index 000000000..f1598f4c5
--- /dev/null
+++ b/mteb/tasks/MAIR/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from .eng.MAIR import *
diff --git a/mteb/tasks/MAIR/eng/MAIR.py b/mteb/tasks/MAIR/eng/MAIR.py
new file mode 100644
index 000000000..fa6667af9
--- /dev/null
+++ b/mteb/tasks/MAIR/eng/MAIR.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+import json
+
+import datasets
+
+from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+__all__ = []
+
+
+TASK2SPLIT = json.loads(
+    '{"Competition-Math": ["queries"], "ProofWiki_Proof": ["queries"], "ProofWiki_Reference": ["queries"], "Stacks_Proof": ["queries"], "Stacks_Reference": ["queries"], "Stein_Proof": ["queries"], "Stein_Reference": ["queries"], "Trench_Proof": ["queries"], "Trench_Reference": ["queries"], "TAD": ["queries"], "TAS2": ["queries"], "StackMathQA": ["queries"], "APPS": ["queries"], "CodeEditSearch": ["queries"], "CodeSearchNet": ["queries"], "Conala": ["queries"], "HumanEval-X": ["queries"], "LeetCode": ["queries"], "MBPP": ["queries"], "RepoBench": ["queries"], "TLDR": ["queries"], "SWE-Bench-Lite": ["astropy__astropy_12544_queries", "astropy__astropy_13158_queries", "astropy__astropy_13162_queries", "astropy__astropy_13398_queries", "astropy__astropy_13438_queries", "astropy__astropy_14439_queries", "astropy__astropy_14701_queries", "astropy__astropy_14966_queries", "astropy__astropy_7441_queries", "astropy__astropy_8707_queries", "django__django_11501_queries", "django__django_12091_queries", "django__django_13192_queries", "django__django_13218_queries", "django__django_13884_queries", "django__django_14441_queries", "django__django_15481_queries", "django__django_15869_queries", "django__django_16901_queries", "django__django_17065_queries", "matplotlib__matplotlib_20518_queries", "matplotlib__matplotlib_23314_queries", "matplotlib__matplotlib_23913_queries", "matplotlib__matplotlib_24627_queries", "matplotlib__matplotlib_24849_queries", "matplotlib__matplotlib_25027_queries", "matplotlib__matplotlib_25238_queries", "matplotlib__matplotlib_25404_queries", "matplotlib__matplotlib_25430_queries", "matplotlib__matplotlib_25746_queries", "mwaskom__seaborn_2389_queries", "mwaskom__seaborn_2576_queries", "mwaskom__seaborn_2766_queries", "mwaskom__seaborn_2813_queries", "mwaskom__seaborn_2853_queries", "mwaskom__seaborn_2946_queries", "mwaskom__seaborn_2979_queries", "mwaskom__seaborn_2996_queries", "mwaskom__seaborn_3202_queries", "mwaskom__seaborn_3407_queries", "pallets__flask_4045_queries", "pallets__flask_4074_queries", "pallets__flask_4160_queries", "pallets__flask_4169_queries", "pallets__flask_4544_queries", "pallets__flask_4575_queries", "pallets__flask_4642_queries", "pallets__flask_4992_queries", "pallets__flask_5014_queries", "pallets__flask_5063_queries", "psf__requests_1537_queries", "psf__requests_1713_queries", "psf__requests_1733_queries", "psf__requests_1766_queries", "psf__requests_2193_queries", "psf__requests_2466_queries", "psf__requests_2821_queries", "psf__requests_3362_queries", "psf__requests_5414_queries", "psf__requests_863_queries", "pydata__xarray_4339_queries", "pydata__xarray_4767_queries", "pydata__xarray_4827_queries", "pydata__xarray_4911_queries", "pydata__xarray_4966_queries", "pydata__xarray_5033_queries", "pydata__xarray_5682_queries", "pydata__xarray_6135_queries", "pydata__xarray_6461_queries", "pydata__xarray_7391_queries", "pylint_dev__pylint_4398_queries", "pylint_dev__pylint_4604_queries", "pylint_dev__pylint_5175_queries", "pylint_dev__pylint_5446_queries", "pylint_dev__pylint_5613_queries", "pylint_dev__pylint_6358_queries", "pylint_dev__pylint_6412_queries", "pylint_dev__pylint_6556_queries", "pylint_dev__pylint_8281_queries", "pylint_dev__pylint_8757_queries", "pytest_dev__pytest_10371_queries", "pytest_dev__pytest_11047_queries", "pytest_dev__pytest_11148_queries", "pytest_dev__pytest_5356_queries", "pytest_dev__pytest_6680_queries", "pytest_dev__pytest_7158_queries", "pytest_dev__pytest_7352_queries", "pytest_dev__pytest_9064_queries", "pytest_dev__pytest_9279_queries", "scikit_learn__scikit_learn_10198_queries", "scikit_learn__scikit_learn_10803_queries", "scikit_learn__scikit_learn_10949_queries", "scikit_learn__scikit_learn_11333_queries", "scikit_learn__scikit_learn_11635_queries", "scikit_learn__scikit_learn_12827_queries", "scikit_learn__scikit_learn_12834_queries", "scikit_learn__scikit_learn_13302_queries", "scikit_learn__scikit_learn_13392_queries", "scikit_learn__scikit_learn_13779_queries", "sphinx_doc__sphinx_11312_queries", "sphinx_doc__sphinx_11502_queries", "sphinx_doc__sphinx_7356_queries", "sphinx_doc__sphinx_7590_queries", "sphinx_doc__sphinx_7757_queries", "sphinx_doc__sphinx_7831_queries", "sphinx_doc__sphinx_8125_queries", "sphinx_doc__sphinx_8863_queries", "sphinx_doc__sphinx_9309_queries", "sphinx_doc__sphinx_9828_queries", "sympy__sympy_13091_queries", "sympy__sympy_14817_queries", "sympy__sympy_14821_queries", "sympy__sympy_15151_queries", "sympy__sympy_15933_queries", "sympy__sympy_16493_queries", "sympy__sympy_16858_queries", "sympy__sympy_17251_queries", "sympy__sympy_18532_queries", "sympy__sympy_20212_queries"], "Apple": ["queries"], "ConvFinQA": ["queries"], "FinQA": ["queries"], "FinanceBench": ["queries"], "HC3Finance": ["queries"], "TAT-DQA": ["queries"], "Trade-the-event": ["queries"], "AY2": ["queries"], "ELI5": ["queries"], "Fever": ["queries"], "TREx": ["queries"], "WnCw": ["queries"], "WnWi": ["queries"], "WoW": ["queries"], "zsRE": ["queries"], "AILA2019-Case": ["queries"], "AILA2019-Statutes": ["queries"], "BSARD": ["queries"], "BillSum": ["queries"], "CUAD": ["GOOSEHEADINSURANCE_queries", "GRANTIERRAENERGY_queries", "HarpoonTherapeutics_queries", "Monsanto_Company_queries"], "GerDaLIR": ["queries"], "LeCaRDv2": ["queries"], "LegalQuAD": ["queries"], "REGIR-EU2UK": ["queries"], "REGIR-UK2EU": ["queries"], "ArguAna": ["queries"], "CQADupStack": ["CQADupStack_Android_queries", "CQADupStack_English_queries", "CQADupStack_Gaming_queries", "CQADupStack_Gis_queries", "CQADupStack_Math_queries", "CQADupStack_Physics_queries", "CQADupStack_Programmers_queries", "CQADupStack_Stats_queries", "CQADupStack_Tex_queries", "CQADupStack_Unix_queries", "CQADupStack_WebMasters_queries", "CQADupStack_Wordpress_queries"], "FiQA": ["queries"], "NFCorpus": ["queries"], "Quora": ["queries"], "SciDocs": ["queries"], "SciFact": ["queries"], "TopiOCQA": ["queries"], "Touche": ["queries"], "Trec-Covid": ["queries"], "ACORDAR": ["queries"], "CPCD": ["queries"], "ChroniclingAmericaQA": ["queries"], "Monant": ["queries"], "NTCIR": ["queries"], "PointRec": ["queries"], "ProCIS-Dialog": ["queries"], "ProCIS-Turn": ["queries"], "QuanTemp": ["queries"], "WebTableSearch": ["queries"], "CARE": ["queries"], "MISeD": ["Bmr006_queries", "Bro027_queries", "covid4_queries", "covid9_queries", "education4_queries"], "SParC": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "SParC-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "LitSearch": ["queries"], "CAsT_2019": ["queries"], "CAsT_2020": ["queries"], "CAsT_2021": ["queries"], "CAsT_2022": ["queries"], "Core_2017": ["queries"], "Microblog_2011": ["queries"], "Microblog_2012": ["queries"], "Microblog_2013": ["queries"], "Microblog_2014": ["queries"], "PrecisionMedicine_2017": ["queries"], "PrecisionMedicine_2018": ["queries"], "PrecisionMedicine_2019": ["queries"], "PrecisionMedicine-Article_2019": ["queries"], "PrecisionMedicine-Article_2020": ["queries"], "CliniDS_2014": ["queries"], "CliniDS_2015": ["queries"], "CliniDS_2016": ["queries"], "ClinicalTrials_2021": ["queries"], "ClinicalTrials_2022": ["queries"], "ClinicalTrials_2023": ["queries"], "DD_2015": ["queries"], "DD_2016": ["queries"], "DD_2017": ["queries"], "FairRanking_2020": ["queries"], "FairRanking_2021": ["queries"], "FairRanking_2022": ["queries"], "Genomics-AdHoc_2004": ["queries"], "Genomics-AdHoc_2005": ["queries"], "Genomics-AdHoc_2006": ["queries"], "Genomics-AdHoc_2007": ["queries"], "TREC-Legal_2011": ["queries"], "NeuCLIR-Tech_2023": ["queries"], "NeuCLIR_2022": ["queries"], "NeuCLIR_2023": ["queries"], "ProductSearch_2023": ["queries"], "ToT_2023": ["queries"], "ToT_2024": ["queries"], "FoodAPI": ["queries"], "HuggingfaceAPI": ["queries"], "PytorchAPI": ["queries"], "SpotifyAPI": ["queries"], "TMDB": ["queries"], "TensorAPI": ["queries"], "ToolBench": ["queries"], "WeatherAPI": ["queries"], "ExcluIR": ["queries"], "Core17": ["queries"], "News21": ["queries"], "Robust04": ["queries"], "InstructIR": ["queries"], "NevIR": ["queries"], "IFEval": ["detectable_format__number_bullet_lists_2078_queries", "detectable_format__number_bullet_lists_102_queries", "detectable_format__number_bullet_lists_2195_queries", "detectable_format__number_bullet_lists_2314_queries", "detectable_format__number_bullet_lists_1934_queries", "detectable_format__number_bullet_lists_2667_queries", "detectable_format__number_bullet_lists_1634_queries", "detectable_format__number_bullet_lists_2100_queries", "detectable_format__number_bullet_lists_1286_queries", "detectable_format__number_bullet_lists_2457_queries", "keywords__letter_frequency_1130_queries", "keywords__letter_frequency_2107_queries", "keywords__letter_frequency_1964_queries", "keywords__letter_frequency_2265_queries", "detectable_format__constrained_response_3752_queries", "detectable_format__constrained_response_3755_queries", "detectable_format__constrained_response_3754_queries", "detectable_format__constrained_response_3753_queries", "detectable_format__constrained_response_227_queries", "detectable_format__constrained_response_3749_queries", "detectable_format__constrained_response_3756_queries", "detectable_format__constrained_response_3751_queries", "detectable_format__constrained_response_3750_queries", "detectable_format__constrained_response_3757_queries", "punctuation__no_comma_2245_queries", "punctuation__no_comma_1107_queries", "punctuation__no_comma_1162_queries", "punctuation__no_comma_1418_queries", "punctuation__no_comma_1001_queries", "punctuation__no_comma_1187_queries", "punctuation__no_comma_1738_queries", "punctuation__no_comma_1300_queries", "punctuation__no_comma_2069_queries", "punctuation__no_comma_1643_queries", "keywords__existence_3156_queries", "keywords__existence_2485_queries", "keywords__existence_1531_queries", "keywords__existence_3732_queries", "keywords__existence_2662_queries", "change_case__english_capital_2341_queries", "change_case__english_capital_3186_queries", "change_case__english_capital_2563_queries", "change_case__english_capital_1999_queries", "change_case__english_capital_24_queries", "change_case__english_capital_1645_queries", "change_case__english_lowercase_1122_queries", "change_case__english_lowercase_1361_queries", "change_case__english_lowercase_1019_queries", "change_case__english_lowercase_1087_queries", "change_case__english_lowercase_1667_queries", "change_case__english_lowercase_1516_queries", "change_case__english_lowercase_1535_queries", "change_case__english_lowercase_1593_queries", "change_case__english_lowercase_1843_queries", "keywords__frequency_1393_queries", "keywords__frequency_1733_queries", "keywords__frequency_2142_queries", "keywords__frequency_2292_queries", "keywords__frequency_1498_queries", "keywords__frequency_1203_queries", "keywords__frequency_1857_queries", "length_constraints__number_sentences_1837_queries", "length_constraints__number_sentences_2674_queries", "length_constraints__number_sentences_2617_queries", "length_constraints__number_sentences_1381_queries", "length_constraints__number_sentences_2266_queries", "length_constraints__number_sentences_1268_queries", "length_constraints__number_sentences_179_queries", "length_constraints__number_paragraphs_1236_queries", "length_constraints__number_paragraphs_2941_queries", "length_constraints__number_paragraphs_1248_queries", "length_constraints__number_paragraphs_1858_queries", "length_constraints__number_paragraphs_1377_queries", "length_constraints__number_paragraphs_2357_queries", "length_constraints__number_paragraphs_2921_queries", "length_constraints__number_paragraphs_1082_queries", "length_constraints__number_paragraphs_2467_queries", "combination__two_responses_1591_queries", "combination__two_responses_1793_queries", "combination__two_responses_2912_queries", "combination__two_responses_1332_queries", "combination__two_responses_2383_queries", "combination__two_responses_136_queries", "combination__two_responses_1098_queries", "combination__two_responses_1746_queries", "combination__two_responses_247_queries", "combination__two_responses_2918_queries", "detectable_content__postscript_2273_queries", "detectable_content__postscript_2070_queries", "detectable_content__postscript_1800_queries", "detectable_content__postscript_1305_queries", "detectable_content__postscript_1759_queries", "detectable_content__postscript_1367_queries", "detectable_content__postscript_1537_queries", "detectable_content__postscript_1879_queries", "detectable_content__postscript_1246_queries", "detectable_content__postscript_1620_queries", "startend__end_checker_2398_queries", "startend__end_checker_1902_queries", "startend__end_checker_2268_queries", "startend__end_checker_1659_queries", "startend__end_checker_1893_queries", "startend__end_checker_2475_queries", "startend__end_checker_1128_queries", "startend__end_checker_1939_queries", "startend__end_checker_1446_queries", "startend__end_checker_1220_queries", "detectable_content__number_placeholders_3280_queries", "detectable_content__number_placeholders_1372_queries", "detectable_content__number_placeholders_3221_queries", "detectable_content__number_placeholders_1927_queries", "detectable_content__number_placeholders_3126_queries", "detectable_content__number_placeholders_2164_queries", "detectable_content__number_placeholders_2136_queries", "detectable_content__number_placeholders_2304_queries", "detectable_content__number_placeholders_3743_queries", "length_constraints__number_words_2323_queries", "length_constraints__number_words_1072_queries", "length_constraints__number_words_1258_queries", "length_constraints__number_words_1251_queries", "length_constraints__number_words_164_queries", "detectable_format__number_highlighted_sections_168_queries", "detectable_format__number_highlighted_sections_1237_queries", "detectable_format__number_highlighted_sections_1601_queries", "detectable_format__number_highlighted_sections_167_queries", "detectable_format__number_highlighted_sections_1773_queries", "detectable_format__number_highlighted_sections_1646_queries", "detectable_format__number_highlighted_sections_1379_queries", "detectable_format__number_highlighted_sections_1307_queries", "detectable_format__number_highlighted_sections_1886_queries", "detectable_format__number_highlighted_sections_1644_queries", "detectable_format__json_format_1094_queries", "detectable_format__json_format_1148_queries", "detectable_format__json_format_1137_queries", "detectable_format__json_format_1075_queries", "detectable_format__json_format_2857_queries", "detectable_format__json_format_3223_queries", "detectable_format__json_format_2404_queries", "detectable_format__json_format_321_queries", "detectable_format__json_format_13_queries", "change_case__capital_word_frequency_2820_queries", "change_case__capital_word_frequency_2849_queries", "change_case__capital_word_frequency_2870_queries", "change_case__capital_word_frequency_1592_queries", "detectable_format__multiple_sections_2023_queries", "detectable_format__multiple_sections_1548_queries", "detectable_format__multiple_sections_2925_queries", "detectable_format__multiple_sections_1131_queries", "detectable_format__multiple_sections_357_queries", "startend__quotation_2015_queries", "startend__quotation_219_queries", "startend__quotation_2010_queries", "startend__quotation_1658_queries", "startend__quotation_1325_queries", "startend__quotation_1776_queries", "startend__quotation_2239_queries", "startend__quotation_1845_queries", "startend__quotation_2209_queries", "length_constraints__nth_paragraph_first_word_2880_queries", "length_constraints__nth_paragraph_first_word_181_queries", "length_constraints__nth_paragraph_first_word_2250_queries", "length_constraints__nth_paragraph_first_word_2215_queries", "length_constraints__nth_paragraph_first_word_3073_queries", "length_constraints__nth_paragraph_first_word_2590_queries", "length_constraints__nth_paragraph_first_word_3624_queries", "length_constraints__nth_paragraph_first_word_1954_queries", "detectable_format__title_1262_queries", "detectable_format__title_2229_queries", "detectable_format__title_295_queries", "detectable_format__title_2097_queries", "detectable_format__title_1802_queries", "detectable_format__title_1322_queries", "detectable_format__title_2969_queries", "detectable_format__title_3057_queries", "detectable_format__title_1551_queries", "detectable_format__title_2807_queries"]}'
+)
+
+MAIR_TASK_CONFIG = json.loads(
+    '{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}'
+)
+
+_MAIR_CITATION = """@inproceedings{Sun2024MAIR,
+  title={MAIR: A Massive Benchmark for Evaluating Instructed Retrieval},
+  author={Weiwei Sun and Zhengliang Shi and Jiulong Wu and Lingyong Yan and Xinyu Ma and Yiding Liu and Min Cao and Dawei Yin and Zhaochun Ren},
+  booktitle={EMNLP},
+  year={2024},
+}"""
+
+
+def get_metadata(task_name):
+    return TaskMetadata(
+        name="MAIR-" + task_name,
+        description="Recent information retrieval (IR) models are pre-trained and instruction-tuned on massive datasets and tasks, enabling them to perform well on a wide range of tasks and potentially generalize to unseen tasks with instructions. However, existing IR benchmarks focus on a limited scope of tasks, making them insufficient for evaluating the latest IR models. In this paper, we propose MAIR (Massive Instructed Retrieval Benchmark), a heterogeneous IR benchmark that includes 126 distinct IR tasks across 6 domains, collected from existing datasets. We benchmark state-of-the-art instruction-tuned text embedding models and re-ranking models. Our experiments reveal that instruction-tuned models generally achieve superior performance compared to non-instruction-tuned models on MAIR. Additionally, our results suggest that current instruction-tuned text embedding models and re-ranking models still lack effectiveness in specific long-tail tasks.",
+        reference="https://github.com/sunnweiwei/MAIR",
+        dataset={
+            "path": "MAIR-Bench/MAIR-Queries",
+            "revision": "7d24eac886a6ae6653a6b67433e1c302cb0e9ac6",
+        },
+        type="Retrieval",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=TASK2SPLIT.get(task_name, []),
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        date=("2023-07-10", "2024-11-10"),
+        domains=["Web"],  # MAIR's own six-domain taxonomy is tracked in MAIR_TASK_CONFIG
+        task_subtypes=["Question answering"],
+        license="mit",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        sample_creation="found",  # queries are LLM-generated; corpus samples are found (extracted from S2ORC)
+        bibtex_citation=_MAIR_CITATION,
+    )
+
+
+def load_data(self, **kwargs):
+    if self.data_loaded:
+        return
+    self.corpus, self.queries, self.relevant_docs = {}, {}, {}
+    queries_path = self.metadata_dict["dataset"]["path"]
+    docs_path = self.metadata_dict["dataset"]["path"].replace("-Queries", "-Docs")
+    task_name = self.metadata.name.replace("MAIR-", "")
+    query_ds = datasets.load_dataset(queries_path, task_name)
+    corpus_ds = datasets.load_dataset(docs_path, task_name)
+    self.metadata.eval_splits = list(query_ds.keys())  # keep metadata in sync with the splits actually loaded
+    for split in query_ds:
+        doc_split = "docs" if split == "queries" else split.replace("_queries", "_docs")
+        self.queries[split] = {item["qid"]: item["query"] for item in query_ds[split]}
+        self.corpus[split] = {
+            item["id"]: {"title": "", "text": item["doc"]}
+            for item in corpus_ds[doc_split]
+        }
+        self.relevant_docs[split] = {
+            item["qid"]: {d["id"]: d["score"] for d in item["labels"]}
+            for item in query_ds[split]
+        }
+
+    self.data_loaded = True
+
+
+for _task in TASK2SPLIT.keys():
+    _class_name = _task.replace("-", "_")
+    _new_class = type(
+        _class_name,
+        (AbsTaskRetrieval,),
+        {"metadata": get_metadata(_task), "load_data": load_data},
+    )
+    globals()[_class_name] = _new_class
+    __all__.append(_class_name)
diff --git a/mteb/tasks/MAIR/eng/__init__.py b/mteb/tasks/MAIR/eng/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/mteb/tasks/MAIR/eng/__init__.py
@@ -0,0 +1 @@
+
diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py
index dfe568bb8..2b0255fc3 100644
--- a/mteb/tasks/__init__.py
+++ b/mteb/tasks/__init__.py
@@ -11,3 +11,4 @@
 from .SpeedTask import *
 from .STS import *
 from .Summarization import *
+from .MAIR import *
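After this patch, the dynamically generated task classes and the new `Benchmark` objects should resolve through the existing `mteb` entry points. A minimal usage sketch, assuming this branch is installed and the `MAIR-Bench/MAIR-Queries` dataset is reachable (the model name is purely illustrative):

```python
import mteb

# Individual MAIR tasks resolve by their generated "MAIR-<task>" names...
tasks = mteb.get_tasks(tasks=["MAIR-LitSearch", "MAIR-FinQA"])

# ...and the aggregate / per-domain benchmarks by their registered names.
medical = mteb.get_benchmark("MAIR(Medical)")

model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")  # illustrative model
results = mteb.MTEB(tasks=tasks).run(model, output_folder="results")
```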
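Since `MAIR_TASK_CONFIG` is now duplicated between `benchmarks.py` and `tasks/MAIR/eng/MAIR.py`, a small consistency check (hypothetical, not part of the patch) would guard against the two copies drifting apart:

```python
from mteb.benchmarks.benchmarks import MAIR_TASK_CONFIG, _get_mair_tasks_by_domain
from mteb.tasks.MAIR.eng.MAIR import TASK2SPLIT

# Every task with split metadata has a domain assignment, and vice versa.
assert set(TASK2SPLIT) == set(MAIR_TASK_CONFIG)

# The six domain buckets partition the full task set.
domains = ["Academic", "Code", "Web", "Legal", "Medical", "Finance"]
by_domain = [t for d in domains for t in _get_mair_tasks_by_domain(d)]
assert sorted(by_domain) == sorted(MAIR_TASK_CONFIG)
```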