From e721ae28b8ff174625bd74714c7504abb4458127 Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 01:09:39 -0500 Subject: [PATCH 01/11] Create MAIR.py --- mteb/tasks/MAIR/eng/MAIR.py | 83 +++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 mteb/tasks/MAIR/eng/MAIR.py diff --git a/mteb/tasks/MAIR/eng/MAIR.py b/mteb/tasks/MAIR/eng/MAIR.py new file mode 100644 index 000000000..2baa25648 --- /dev/null +++ b/mteb/tasks/MAIR/eng/MAIR.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +import datasets +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + +__all__ = [] + + +TASK2SPLIT = {'Competition-Math': ['queries'], 'ProofWiki_Proof': ['queries'], 'ProofWiki_Reference': ['queries'], 'Stacks_Proof': ['queries'], 'Stacks_Reference': ['queries'], 'Stein_Proof': ['queries'], 'Stein_Reference': ['queries'], 'Trench_Proof': ['queries'], 'Trench_Reference': ['queries'], 'TAD': ['queries'], 'TAS2': ['queries'], 'StackMathQA': ['queries'], 'APPS': ['queries'], 'CodeEditSearch': ['queries'], 'CodeSearchNet': ['queries'], 'Conala': ['queries'], 'HumanEval-X': ['queries'], 'LeetCode': ['queries'], 'MBPP': ['queries'], 'RepoBench': ['queries'], 'TLDR': ['queries'], 'SWE-Bench-Lite': ['astropy__astropy_12544_queries', 'astropy__astropy_13158_queries', 'astropy__astropy_13162_queries', 'astropy__astropy_13398_queries', 'astropy__astropy_13438_queries', 'astropy__astropy_14439_queries', 'astropy__astropy_14701_queries', 'astropy__astropy_14966_queries', 'astropy__astropy_7441_queries', 'astropy__astropy_8707_queries', 'django__django_11501_queries', 'django__django_12091_queries', 'django__django_13192_queries', 'django__django_13218_queries', 'django__django_13884_queries', 'django__django_14441_queries', 'django__django_15481_queries', 'django__django_15869_queries', 'django__django_16901_queries', 'django__django_17065_queries', 'matplotlib__matplotlib_20518_queries', 'matplotlib__matplotlib_23314_queries', 'matplotlib__matplotlib_23913_queries', 'matplotlib__matplotlib_24627_queries', 'matplotlib__matplotlib_24849_queries', 'matplotlib__matplotlib_25027_queries', 'matplotlib__matplotlib_25238_queries', 'matplotlib__matplotlib_25404_queries', 'matplotlib__matplotlib_25430_queries', 'matplotlib__matplotlib_25746_queries', 'mwaskom__seaborn_2389_queries', 'mwaskom__seaborn_2576_queries', 'mwaskom__seaborn_2766_queries', 'mwaskom__seaborn_2813_queries', 'mwaskom__seaborn_2853_queries', 'mwaskom__seaborn_2946_queries', 'mwaskom__seaborn_2979_queries', 'mwaskom__seaborn_2996_queries', 'mwaskom__seaborn_3202_queries', 'mwaskom__seaborn_3407_queries', 'pallets__flask_4045_queries', 'pallets__flask_4074_queries', 'pallets__flask_4160_queries', 'pallets__flask_4169_queries', 'pallets__flask_4544_queries', 'pallets__flask_4575_queries', 'pallets__flask_4642_queries', 'pallets__flask_4992_queries', 'pallets__flask_5014_queries', 'pallets__flask_5063_queries', 'psf__requests_1537_queries', 'psf__requests_1713_queries', 'psf__requests_1733_queries', 'psf__requests_1766_queries', 'psf__requests_2193_queries', 'psf__requests_2466_queries', 'psf__requests_2821_queries', 'psf__requests_3362_queries', 'psf__requests_5414_queries', 'psf__requests_863_queries', 'pydata__xarray_4339_queries', 'pydata__xarray_4767_queries', 'pydata__xarray_4827_queries', 'pydata__xarray_4911_queries', 'pydata__xarray_4966_queries', 'pydata__xarray_5033_queries', 'pydata__xarray_5682_queries', 'pydata__xarray_6135_queries', 'pydata__xarray_6461_queries', 'pydata__xarray_7391_queries', 'pylint_dev__pylint_4398_queries', 'pylint_dev__pylint_4604_queries', 'pylint_dev__pylint_5175_queries', 'pylint_dev__pylint_5446_queries', 'pylint_dev__pylint_5613_queries', 'pylint_dev__pylint_6358_queries', 'pylint_dev__pylint_6412_queries', 'pylint_dev__pylint_6556_queries', 'pylint_dev__pylint_8281_queries', 'pylint_dev__pylint_8757_queries', 'pytest_dev__pytest_10371_queries', 'pytest_dev__pytest_11047_queries', 'pytest_dev__pytest_11148_queries', 'pytest_dev__pytest_5356_queries', 'pytest_dev__pytest_6680_queries', 'pytest_dev__pytest_7158_queries', 'pytest_dev__pytest_7352_queries', 'pytest_dev__pytest_9064_queries', 'pytest_dev__pytest_9279_queries', 'scikit_learn__scikit_learn_10198_queries', 'scikit_learn__scikit_learn_10803_queries', 'scikit_learn__scikit_learn_10949_queries', 'scikit_learn__scikit_learn_11333_queries', 'scikit_learn__scikit_learn_11635_queries', 'scikit_learn__scikit_learn_12827_queries', 'scikit_learn__scikit_learn_12834_queries', 'scikit_learn__scikit_learn_13302_queries', 'scikit_learn__scikit_learn_13392_queries', 'scikit_learn__scikit_learn_13779_queries', 'sphinx_doc__sphinx_11312_queries', 'sphinx_doc__sphinx_11502_queries', 'sphinx_doc__sphinx_7356_queries', 'sphinx_doc__sphinx_7590_queries', 'sphinx_doc__sphinx_7757_queries', 'sphinx_doc__sphinx_7831_queries', 'sphinx_doc__sphinx_8125_queries', 'sphinx_doc__sphinx_8863_queries', 'sphinx_doc__sphinx_9309_queries', 'sphinx_doc__sphinx_9828_queries', 'sympy__sympy_13091_queries', 'sympy__sympy_14817_queries', 'sympy__sympy_14821_queries', 'sympy__sympy_15151_queries', 'sympy__sympy_15933_queries', 'sympy__sympy_16493_queries', 'sympy__sympy_16858_queries', 'sympy__sympy_17251_queries', 'sympy__sympy_18532_queries', 'sympy__sympy_20212_queries'], 'Apple': ['queries'], 'ConvFinQA': ['queries'], 'FinQA': ['queries'], 'FinanceBench': ['queries'], 'HC3Finance': ['queries'], 'TAT-DQA': ['queries'], 'Trade-the-event': ['queries'], 'AY2': ['queries'], 'ELI5': ['queries'], 'Fever': ['queries'], 'TREx': ['queries'], 'WnCw': ['queries'], 'WnWi': ['queries'], 'WoW': ['queries'], 'zsRE': ['queries'], 'AILA2019-Case': ['queries'], 'AILA2019-Statutes': ['queries'], 'BSARD': ['queries'], 'BillSum': ['queries'], 'CUAD': ['GOOSEHEADINSURANCE_queries', 'GRANTIERRAENERGY_queries', 'HarpoonTherapeutics_queries', 'Monsanto_Company_queries'], 'GerDaLIR': ['queries'], 'LeCaRDv2': ['queries'], 'LegalQuAD': ['queries'], 'REGIR-EU2UK': ['queries'], 'REGIR-UK2EU': ['queries'], 'ArguAna': ['queries'], 'CQADupStack': ['CQADupStack_Android_queries', 'CQADupStack_English_queries', 'CQADupStack_Gaming_queries', 'CQADupStack_Gis_queries', 'CQADupStack_Math_queries', 'CQADupStack_Physics_queries', 'CQADupStack_Programmers_queries', 'CQADupStack_Stats_queries', 'CQADupStack_Tex_queries', 'CQADupStack_Unix_queries', 'CQADupStack_WebMasters_queries', 'CQADupStack_Wordpress_queries'], 'FiQA': ['queries'], 'NFCorpus': ['queries'], 'Quora': ['queries'], 'SciDocs': ['queries'], 'SciFact': ['queries'], 'TopiOCQA': ['queries'], 'Touche': ['queries'], 'Trec-Covid': ['queries'], 'ACORDAR': ['queries'], 'CPCD': ['queries'], 'ChroniclingAmericaQA': ['queries'], 'Monant': ['queries'], 'NTCIR': ['queries'], 'PointRec': ['queries'], 'ProCIS-Dialog': ['queries'], 'ProCIS-Turn': ['queries'], 'QuanTemp': ['queries'], 'WebTableSearch': ['queries'], 'CARE': ['queries'], 'MISeD': ['Bmr006_queries', 'Bro027_queries', 'covid4_queries', 'covid9_queries', 'education4_queries'], 'SParC': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'SParC-SQL': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'Spider': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'Spider-SQL': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'LitSearch': ['queries'], 'CAsT_2019': ['queries'], 'CAsT_2020': ['queries'], 'CAsT_2021': ['queries'], 'CAsT_2022': ['queries'], 'Core_2017': ['queries'], 'Microblog_2011': ['queries'], 'Microblog_2012': ['queries'], 'Microblog_2013': ['queries'], 'Microblog_2014': ['queries'], 'PrecisionMedicine_2017': ['queries'], 'PrecisionMedicine_2018': ['queries'], 'PrecisionMedicine_2019': ['queries'], 'PrecisionMedicine-Article_2019': ['queries'], 'PrecisionMedicine-Article_2020': ['queries'], 'CliniDS_2014': ['queries'], 'CliniDS_2015': ['queries'], 'CliniDS_2016': ['queries'], 'ClinicalTrials_2021': ['queries'], 'ClinicalTrials_2022': ['queries'], 'ClinicalTrials_2023': ['queries'], 'DD_2015': ['queries'], 'DD_2016': ['queries'], 'DD_2017': ['queries'], 'FairRanking_2020': ['queries'], 'FairRanking_2021': ['queries'], 'FairRanking_2022': ['queries'], 'Genomics-AdHoc_2004': ['queries'], 'Genomics-AdHoc_2005': ['queries'], 'Genomics-AdHoc_2006': ['queries'], 'Genomics-AdHoc_2007': ['queries'], 'TREC-Legal_2011': ['queries'], 'NeuCLIR-Tech_2023': ['queries'], 'NeuCLIR_2022': ['queries'], 'NeuCLIR_2023': ['queries'], 'ProductSearch_2023': ['queries'], 'ToT_2023': ['queries'], 'ToT_2024': ['queries'], 'FoodAPI': ['queries'], 'HuggingfaceAPI': ['queries'], 'PytorchAPI': ['queries'], 'SpotifyAPI': ['queries'], 'TMDB': ['queries'], 'TensorAPI': ['queries'], 'ToolBench': ['queries'], 'WeatherAPI': ['queries'], 'ExcluIR': ['queries'], 'Core17': ['queries'], 'News21': ['queries'], 'Robust04': ['queries'], 'InstructIR': ['queries'], 'NevIR': ['queries'], 'IFEval': ['detectable_format__number_bullet_lists_2078_queries', 'detectable_format__number_bullet_lists_102_queries', 'detectable_format__number_bullet_lists_2195_queries', 'detectable_format__number_bullet_lists_2314_queries', 'detectable_format__number_bullet_lists_1934_queries', 'detectable_format__number_bullet_lists_2667_queries', 'detectable_format__number_bullet_lists_1634_queries', 'detectable_format__number_bullet_lists_2100_queries', 'detectable_format__number_bullet_lists_1286_queries', 'detectable_format__number_bullet_lists_2457_queries', 'keywords__letter_frequency_1130_queries', 'keywords__letter_frequency_2107_queries', 'keywords__letter_frequency_1964_queries', 'keywords__letter_frequency_2265_queries', 'detectable_format__constrained_response_3752_queries', 'detectable_format__constrained_response_3755_queries', 'detectable_format__constrained_response_3754_queries', 'detectable_format__constrained_response_3753_queries', 'detectable_format__constrained_response_227_queries', 'detectable_format__constrained_response_3749_queries', 'detectable_format__constrained_response_3756_queries', 'detectable_format__constrained_response_3751_queries', 'detectable_format__constrained_response_3750_queries', 'detectable_format__constrained_response_3757_queries', 'punctuation__no_comma_2245_queries', 'punctuation__no_comma_1107_queries', 'punctuation__no_comma_1162_queries', 'punctuation__no_comma_1418_queries', 'punctuation__no_comma_1001_queries', 'punctuation__no_comma_1187_queries', 'punctuation__no_comma_1738_queries', 'punctuation__no_comma_1300_queries', 'punctuation__no_comma_2069_queries', 'punctuation__no_comma_1643_queries', 'keywords__existence_3156_queries', 'keywords__existence_2485_queries', 'keywords__existence_1531_queries', 'keywords__existence_3732_queries', 'keywords__existence_2662_queries', 'change_case__english_capital_2341_queries', 'change_case__english_capital_3186_queries', 'change_case__english_capital_2563_queries', 'change_case__english_capital_1999_queries', 'change_case__english_capital_24_queries', 'change_case__english_capital_1645_queries', 'change_case__english_lowercase_1122_queries', 'change_case__english_lowercase_1361_queries', 'change_case__english_lowercase_1019_queries', 'change_case__english_lowercase_1087_queries', 'change_case__english_lowercase_1667_queries', 'change_case__english_lowercase_1516_queries', 'change_case__english_lowercase_1535_queries', 'change_case__english_lowercase_1593_queries', 'change_case__english_lowercase_1843_queries', 'keywords__frequency_1393_queries', 'keywords__frequency_1733_queries', 'keywords__frequency_2142_queries', 'keywords__frequency_2292_queries', 'keywords__frequency_1498_queries', 'keywords__frequency_1203_queries', 'keywords__frequency_1857_queries', 'length_constraints__number_sentences_1837_queries', 'length_constraints__number_sentences_2674_queries', 'length_constraints__number_sentences_2617_queries', 'length_constraints__number_sentences_1381_queries', 'length_constraints__number_sentences_2266_queries', 'length_constraints__number_sentences_1268_queries', 'length_constraints__number_sentences_179_queries', 'length_constraints__number_paragraphs_1236_queries', 'length_constraints__number_paragraphs_2941_queries', 'length_constraints__number_paragraphs_1248_queries', 'length_constraints__number_paragraphs_1858_queries', 'length_constraints__number_paragraphs_1377_queries', 'length_constraints__number_paragraphs_2357_queries', 'length_constraints__number_paragraphs_2921_queries', 'length_constraints__number_paragraphs_1082_queries', 'length_constraints__number_paragraphs_2467_queries', 'combination__two_responses_1591_queries', 'combination__two_responses_1793_queries', 'combination__two_responses_2912_queries', 'combination__two_responses_1332_queries', 'combination__two_responses_2383_queries', 'combination__two_responses_136_queries', 'combination__two_responses_1098_queries', 'combination__two_responses_1746_queries', 'combination__two_responses_247_queries', 'combination__two_responses_2918_queries', 'detectable_content__postscript_2273_queries', 'detectable_content__postscript_2070_queries', 'detectable_content__postscript_1800_queries', 'detectable_content__postscript_1305_queries', 'detectable_content__postscript_1759_queries', 'detectable_content__postscript_1367_queries', 'detectable_content__postscript_1537_queries', 'detectable_content__postscript_1879_queries', 'detectable_content__postscript_1246_queries', 'detectable_content__postscript_1620_queries', 'startend__end_checker_2398_queries', 'startend__end_checker_1902_queries', 'startend__end_checker_2268_queries', 'startend__end_checker_1659_queries', 'startend__end_checker_1893_queries', 'startend__end_checker_2475_queries', 'startend__end_checker_1128_queries', 'startend__end_checker_1939_queries', 'startend__end_checker_1446_queries', 'startend__end_checker_1220_queries', 'detectable_content__number_placeholders_3280_queries', 'detectable_content__number_placeholders_1372_queries', 'detectable_content__number_placeholders_3221_queries', 'detectable_content__number_placeholders_1927_queries', 'detectable_content__number_placeholders_3126_queries', 'detectable_content__number_placeholders_2164_queries', 'detectable_content__number_placeholders_2136_queries', 'detectable_content__number_placeholders_2304_queries', 'detectable_content__number_placeholders_3743_queries', 'length_constraints__number_words_2323_queries', 'length_constraints__number_words_1072_queries', 'length_constraints__number_words_1258_queries', 'length_constraints__number_words_1251_queries', 'length_constraints__number_words_164_queries', 'detectable_format__number_highlighted_sections_168_queries', 'detectable_format__number_highlighted_sections_1237_queries', 'detectable_format__number_highlighted_sections_1601_queries', 'detectable_format__number_highlighted_sections_167_queries', 'detectable_format__number_highlighted_sections_1773_queries', 'detectable_format__number_highlighted_sections_1646_queries', 'detectable_format__number_highlighted_sections_1379_queries', 'detectable_format__number_highlighted_sections_1307_queries', 'detectable_format__number_highlighted_sections_1886_queries', 'detectable_format__number_highlighted_sections_1644_queries', 'detectable_format__json_format_1094_queries', 'detectable_format__json_format_1148_queries', 'detectable_format__json_format_1137_queries', 'detectable_format__json_format_1075_queries', 'detectable_format__json_format_2857_queries', 'detectable_format__json_format_3223_queries', 'detectable_format__json_format_2404_queries', 'detectable_format__json_format_321_queries', 'detectable_format__json_format_13_queries', 'change_case__capital_word_frequency_2820_queries', 'change_case__capital_word_frequency_2849_queries', 'change_case__capital_word_frequency_2870_queries', 'change_case__capital_word_frequency_1592_queries', 'detectable_format__multiple_sections_2023_queries', 'detectable_format__multiple_sections_1548_queries', 'detectable_format__multiple_sections_2925_queries', 'detectable_format__multiple_sections_1131_queries', 'detectable_format__multiple_sections_357_queries', 'startend__quotation_2015_queries', 'startend__quotation_219_queries', 'startend__quotation_2010_queries', 'startend__quotation_1658_queries', 'startend__quotation_1325_queries', 'startend__quotation_1776_queries', 'startend__quotation_2239_queries', 'startend__quotation_1845_queries', 'startend__quotation_2209_queries', 'length_constraints__nth_paragraph_first_word_2880_queries', 'length_constraints__nth_paragraph_first_word_181_queries', 'length_constraints__nth_paragraph_first_word_2250_queries', 'length_constraints__nth_paragraph_first_word_2215_queries', 'length_constraints__nth_paragraph_first_word_3073_queries', 'length_constraints__nth_paragraph_first_word_2590_queries', 'length_constraints__nth_paragraph_first_word_3624_queries', 'length_constraints__nth_paragraph_first_word_1954_queries', 'detectable_format__title_1262_queries', 'detectable_format__title_2229_queries', 'detectable_format__title_295_queries', 'detectable_format__title_2097_queries', 'detectable_format__title_1802_queries', 'detectable_format__title_1322_queries', 'detectable_format__title_2969_queries', 'detectable_format__title_3057_queries', 'detectable_format__title_1551_queries', 'detectable_format__title_2807_queries']} + +MAIR_TASK_CONFIG = {'Competition-Math': 'Academic', 'ProofWiki_Proof': 'Academic', 'ProofWiki_Reference': 'Academic', 'Stacks_Proof': 'Academic', 'Stacks_Reference': 'Academic', 'Stein_Proof': 'Academic', 'Stein_Reference': 'Academic', 'Trench_Proof': 'Academic', 'Trench_Reference': 'Academic', 'TAD': 'Academic', 'TAS2': 'Academic', 'StackMathQA': 'Academic', 'APPS': 'Code', 'CodeEditSearch': 'Code', 'CodeSearchNet': 'Code', 'Conala': 'Code', 'HumanEval-X': 'Code', 'LeetCode': 'Code', 'MBPP': 'Code', 'RepoBench': 'Code', 'TLDR': 'Code', 'SWE-Bench-Lite': 'Code', 'Apple': 'Finance', 'ConvFinQA': 'Finance', 'FinQA': 'Finance', 'FinanceBench': 'Finance', 'HC3Finance': 'Finance', 'TAT-DQA': 'Finance', 'Trade-the-event': 'Finance', 'AY2': 'Web', 'ELI5': 'Web', 'Fever': 'Web', 'TREx': 'Web', 'WnCw': 'Web', 'WnWi': 'Web', 'WoW': 'Web', 'zsRE': 'Web', 'AILA2019-Case': 'Legal', 'AILA2019-Statutes': 'Legal', 'BSARD': 'Legal', 'BillSum': 'Legal', 'CUAD': 'Legal', 'GerDaLIR': 'Legal', 'LeCaRDv2': 'Legal', 'LegalQuAD': 'Legal', 'REGIR-EU2UK': 'Legal', 'REGIR-UK2EU': 'Legal', 'ArguAna': 'Web', 'CQADupStack': 'Web', 'FiQA': 'Finance', 'NFCorpus': 'Medical', 'Quora': 'Web', 'SciDocs': 'Academic', 'SciFact': 'Academic', 'TopiOCQA': 'Web', 'Touche': 'Web', 'Trec-Covid': 'Medical', 'ACORDAR': 'Web', 'CPCD': 'Web', 'ChroniclingAmericaQA': 'Web', 'Monant': 'Medical', 'NTCIR': 'Web', 'PointRec': 'Web', 'ProCIS-Dialog': 'Web', 'ProCIS-Turn': 'Web', 'QuanTemp': 'Web', 'WebTableSearch': 'Web', 'CARE': 'Medical', 'MISeD': 'Web', 'SParC': 'Web', 'SParC-SQL': 'Web', 'Spider': 'Web', 'Spider-SQL': 'Web', 'LitSearch': 'Academic', 'CAsT_2019': 'Web', 'CAsT_2020': 'Web', 'CAsT_2021': 'Web', 'CAsT_2022': 'Web', 'Core_2017': 'Web', 'Microblog_2011': 'Web', 'Microblog_2012': 'Web', 'Microblog_2013': 'Web', 'Microblog_2014': 'Web', 'PrecisionMedicine_2017': 'Medical', 'PrecisionMedicine_2018': 'Medical', 'PrecisionMedicine_2019': 'Medical', 'PrecisionMedicine-Article_2019': 'Medical', 'PrecisionMedicine-Article_2020': 'Medical', 'CliniDS_2014': 'Medical', 'CliniDS_2015': 'Medical', 'CliniDS_2016': 'Medical', 'ClinicalTrials_2021': 'Medical', 'ClinicalTrials_2022': 'Medical', 'ClinicalTrials_2023': 'Medical', 'DD_2015': 'Web', 'DD_2016': 'Web', 'DD_2017': 'Web', 'FairRanking_2020': 'Academic', 'FairRanking_2021': 'Web', 'FairRanking_2022': 'Web', 'Genomics-AdHoc_2004': 'Medical', 'Genomics-AdHoc_2005': 'Medical', 'Genomics-AdHoc_2006': 'Medical', 'Genomics-AdHoc_2007': 'Medical', 'TREC-Legal_2011': 'Legal', 'NeuCLIR-Tech_2023': 'Web', 'NeuCLIR_2022': 'Web', 'NeuCLIR_2023': 'Web', 'ProductSearch_2023': 'Web', 'ToT_2023': 'Web', 'ToT_2024': 'Web', 'FoodAPI': 'Code', 'HuggingfaceAPI': 'Code', 'PytorchAPI': 'Code', 'SpotifyAPI': 'Code', 'TMDB': 'Code', 'TensorAPI': 'Code', 'ToolBench': 'Code', 'WeatherAPI': 'Code', 'ExcluIR': 'Web', 'Core17': 'Web', 'News21': 'Web', 'Robust04': 'Web', 'InstructIR': 'Web', 'NevIR': 'Web', 'IFEval': 'Web'} + +_MAIR_CITATION = """@inproceedings{Sun2024MAIR, + title={MAIR: A Massive Benchmark for Evaluating Instructed Retrieval}, + author={Weiwei Sun and Zhengliang Shi and Jiulong Wu and Lingyong Yan and Xinyu Ma and Yiding Liu and Min Cao and Dawei Yin and Zhaochun Ren}, + booktitle={EMNLP}, + year={2024}, +}""" + + +def get_metadata(task_name): + return TaskMetadata( + name="MAIR-" + task_name, + description=""" + Recent information retrieval (IR) models are pre-trained and instruction-tuned on massive datasets and tasks, enabling them to perform well on a wide range of tasks and potentially generalize to unseen tasks with instructions. However, existing IR benchmarks focus on a limited scope of tasks, making them insufficient for evaluating the latest IR models. In this paper, we propose MAIR (Massive Instructed Retrieval Benchmark), a heterogeneous IR benchmark that includes 126 distinct IR tasks across 6 domains, collected from existing datasets. We benchmark state-of-the-art instruction-tuned text embedding models and re-ranking models. Our experiments reveal that instruction-tuned models generally achieve superior performance compared to non-instruction-tuned models on MAIR. Additionally, our results suggest that current instruction-tuned text embedding models and re-ranking models still lack effectiveness in specific long-tail tasks. + """, + reference="https://github.com/sunnweiwei/MAIR", + dataset={ + "path": "MAIR-Bench/MAIR", + "revision": "7d24eac886a6ae6653a6b67433e1c302cb0e9ac6", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=TASK2SPLIT.get(task_name, []), + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2023-07-10", "2024-11-10"), + domains=["Web"], + task_subtypes=["Question answering"], + license="mit", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", # queries LLM generated, corpus samples are found (extracted from S2ORC) + bibtex_citation=_MAIR_CITATION, + ) + + +def load_data(self, **kwargs): + if self.data_loaded: + return + self.corpus, self.queries, self.relevant_docs = {}, {}, {} + queries_path = self.metadata_dict["dataset"]["path"] + '-Queries' + docs_path = self.metadata_dict["dataset"]["path"] + '-Docs' + task_name = self.metadata.name.replace("MAIR-", "") + query_ds = datasets.load_dataset(queries_path, task_name) + corpus_ds = datasets.load_dataset(docs_path, task_name) + self.metadata.eval_splits = [] + for split in query_ds: + doc_split = 'docs' if split == 'queries' else split.replace('_queries', '_docs') + self.queries[split] = {item['qid']: item['query'] for item in query_ds[split]} + self.corpus[split] = {item['id']: {'title': '', 'text': item['doc']} for item in corpus_ds[doc_split]} + self.relevant_docs[split] = { + item['qid']: {d['id']: d['score'] for d in item['labels']} + for item in query_ds[split] + } + self.metadata.eval_splits.append(split) + + self.data_loaded = True + +for _task in TASK2SPLIT.keys(): + _class_name = _task.replace('-', '_') + _new_class = type( + _class_name, + (AbsTaskRetrieval,), + { + 'metadata': get_metadata(_task), + 'load_data': load_data + } + ) + globals()[_class_name] = _new_class + __all__.append(_class_name) From 450b1178a17e1c84d923fed2a29ec4d7bb865433 Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 01:09:59 -0500 Subject: [PATCH 02/11] Create __init__.py --- mteb/tasks/MAIR/eng/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 mteb/tasks/MAIR/eng/__init__.py diff --git a/mteb/tasks/MAIR/eng/__init__.py b/mteb/tasks/MAIR/eng/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/mteb/tasks/MAIR/eng/__init__.py @@ -0,0 +1 @@ + From 19737df3148938fd1b84f72e3d60e0a55b4cdc1e Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 01:10:15 -0500 Subject: [PATCH 03/11] Create __init__.py --- mteb/tasks/MAIR/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 mteb/tasks/MAIR/__init__.py diff --git a/mteb/tasks/MAIR/__init__.py b/mteb/tasks/MAIR/__init__.py new file mode 100644 index 000000000..f1598f4c5 --- /dev/null +++ b/mteb/tasks/MAIR/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from .eng.MAIR import * From ba8e3edd1678658b51ea7fb8b8d1ae76005eda4c Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 01:11:52 -0500 Subject: [PATCH 04/11] Update benchmarks.py --- mteb/benchmarks/benchmarks.py | 94 +++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index c5181d0ab..cd0d584df 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -918,3 +918,97 @@ def load_results( reference=None, citation=None, ) + + +MAIR_TASK_CONFIG = {'Competition-Math': 'Academic', 'ProofWiki_Proof': 'Academic', 'ProofWiki_Reference': 'Academic', 'Stacks_Proof': 'Academic', 'Stacks_Reference': 'Academic', 'Stein_Proof': 'Academic', 'Stein_Reference': 'Academic', 'Trench_Proof': 'Academic', 'Trench_Reference': 'Academic', 'TAD': 'Academic', 'TAS2': 'Academic', 'StackMathQA': 'Academic', 'APPS': 'Code', 'CodeEditSearch': 'Code', 'CodeSearchNet': 'Code', 'Conala': 'Code', 'HumanEval-X': 'Code', 'LeetCode': 'Code', 'MBPP': 'Code', 'RepoBench': 'Code', 'TLDR': 'Code', 'SWE-Bench-Lite': 'Code', 'Apple': 'Finance', 'ConvFinQA': 'Finance', 'FinQA': 'Finance', 'FinanceBench': 'Finance', 'HC3Finance': 'Finance', 'TAT-DQA': 'Finance', 'Trade-the-event': 'Finance', 'AY2': 'Web', 'ELI5': 'Web', 'Fever': 'Web', 'TREx': 'Web', 'WnCw': 'Web', 'WnWi': 'Web', 'WoW': 'Web', 'zsRE': 'Web', 'AILA2019-Case': 'Legal', 'AILA2019-Statutes': 'Legal', 'BSARD': 'Legal', 'BillSum': 'Legal', 'CUAD': 'Legal', 'GerDaLIR': 'Legal', 'LeCaRDv2': 'Legal', 'LegalQuAD': 'Legal', 'REGIR-EU2UK': 'Legal', 'REGIR-UK2EU': 'Legal', 'ArguAna': 'Web', 'CQADupStack': 'Web', 'FiQA': 'Finance', 'NFCorpus': 'Medical', 'Quora': 'Web', 'SciDocs': 'Academic', 'SciFact': 'Academic', 'TopiOCQA': 'Web', 'Touche': 'Web', 'Trec-Covid': 'Medical', 'ACORDAR': 'Web', 'CPCD': 'Web', 'ChroniclingAmericaQA': 'Web', 'Monant': 'Medical', 'NTCIR': 'Web', 'PointRec': 'Web', 'ProCIS-Dialog': 'Web', 'ProCIS-Turn': 'Web', 'QuanTemp': 'Web', 'WebTableSearch': 'Web', 'CARE': 'Medical', 'MISeD': 'Web', 'SParC': 'Web', 'SParC-SQL': 'Web', 'Spider': 'Web', 'Spider-SQL': 'Web', 'LitSearch': 'Academic', 'CAsT_2019': 'Web', 'CAsT_2020': 'Web', 'CAsT_2021': 'Web', 'CAsT_2022': 'Web', 'Core_2017': 'Web', 'Microblog_2011': 'Web', 'Microblog_2012': 'Web', 'Microblog_2013': 'Web', 'Microblog_2014': 'Web', 'PrecisionMedicine_2017': 'Medical', 'PrecisionMedicine_2018': 'Medical', 'PrecisionMedicine_2019': 'Medical', 'PrecisionMedicine-Article_2019': 'Medical', 'PrecisionMedicine-Article_2020': 'Medical', 'CliniDS_2014': 'Medical', 'CliniDS_2015': 'Medical', 'CliniDS_2016': 'Medical', 'ClinicalTrials_2021': 'Medical', 'ClinicalTrials_2022': 'Medical', 'ClinicalTrials_2023': 'Medical', 'DD_2015': 'Web', 'DD_2016': 'Web', 'DD_2017': 'Web', 'FairRanking_2020': 'Academic', 'FairRanking_2021': 'Web', 'FairRanking_2022': 'Web', 'Genomics-AdHoc_2004': 'Medical', 'Genomics-AdHoc_2005': 'Medical', 'Genomics-AdHoc_2006': 'Medical', 'Genomics-AdHoc_2007': 'Medical', 'TREC-Legal_2011': 'Legal', 'NeuCLIR-Tech_2023': 'Web', 'NeuCLIR_2022': 'Web', 'NeuCLIR_2023': 'Web', 'ProductSearch_2023': 'Web', 'ToT_2023': 'Web', 'ToT_2024': 'Web', 'FoodAPI': 'Code', 'HuggingfaceAPI': 'Code', 'PytorchAPI': 'Code', 'SpotifyAPI': 'Code', 'TMDB': 'Code', 'TensorAPI': 'Code', 'ToolBench': 'Code', 'WeatherAPI': 'Code', 'ExcluIR': 'Web', 'Core17': 'Web', 'News21': 'Web', 'Robust04': 'Web', 'InstructIR': 'Web', 'NevIR': 'Web', 'IFEval': 'Web'} + + +def _get_mair_tasks_by_domain(domain): + assert domain in ['Academic', 'Code', 'Web', 'Legal', 'Medical', 'Finance'] + out = [] + for task in MAIR_TASK_CONFIG: + if MAIR_TASK_CONFIG[task] == domain: + out.append(task) + return out + + +def _get_mair_all_tasks(): + return list(MAIR_TASK_CONFIG.keys()) + + +_MAIR_CITATION = """@inproceedings{Sun2024MAIR, + title={MAIR: A Massive Benchmark for Evaluating Instructed Retrieval}, + author={Weiwei Sun and Zhengliang Shi and Jiulong Wu and Lingyong Yan and Xinyu Ma and Yiding Liu and Min Cao and Dawei Yin and Zhaochun Ren}, + booktitle={EMNLP}, + year={2024}, +}""" + +MAIR = Benchmark( + name="MAIR", + tasks=get_tasks( + tasks=['MAIR-' + name for name in _get_mair_all_tasks()] + ), + description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", + reference="https://github.com/sunnweiwei/MAIR", + citation=_MAIR_CITATION, +) + +MAIR_WEB = Benchmark( + name="MAIR(Web)", + tasks=get_tasks( + tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Web')] + ), + description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", + reference="https://github.com/sunnweiwei/MAIR", + citation=_MAIR_CITATION, +) + +MAIR_CODE = Benchmark( + name="MAIR(Code)", + tasks=get_tasks( + tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Code')] + ), + description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", + reference="https://github.com/sunnweiwei/MAIR", + citation=_MAIR_CITATION, +) + +MAIR_ACADEMIC = Benchmark( + name="MAIR(Academic)", + tasks=get_tasks( + tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Academic')] + ), + description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", + reference="https://github.com/sunnweiwei/MAIR", + citation=_MAIR_CITATION, +) + +MAIR_LEGAL = Benchmark( + name="MAIR(Legal)", + tasks=get_tasks( + tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Legal')] + ), + description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", + reference="https://github.com/sunnweiwei/MAIR", + citation=_MAIR_CITATION, +) + +MAIR_MEDICAL = Benchmark( + name="MAIR(Medical)", + tasks=get_tasks( + tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Medical')] + ), + description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", + reference="https://github.com/sunnweiwei/MAIR", + citation=_MAIR_CITATION, +) + +MAIR_FINANCE = Benchmark( + name="MAIR(Finance)", + tasks=get_tasks( + tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Finance')] + ), + description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", + reference="https://github.com/sunnweiwei/MAIR", + citation=_MAIR_CITATION, +) From a1769592768d4089e5f6086f4a736ac7256d2484 Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 01:38:42 -0500 Subject: [PATCH 05/11] Update __init__.py --- mteb/tasks/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index dfe568bb8..2b0255fc3 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -11,3 +11,4 @@ from .SpeedTask import * from .STS import * from .Summarization import * +from .MAIR import * From f19dc81da1420a1af77101d8bcd0a6dc7eeacb24 Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 01:52:20 -0500 Subject: [PATCH 06/11] Update MAIR.py --- mteb/tasks/MAIR/eng/MAIR.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mteb/tasks/MAIR/eng/MAIR.py b/mteb/tasks/MAIR/eng/MAIR.py index 2baa25648..801ed8e27 100644 --- a/mteb/tasks/MAIR/eng/MAIR.py +++ b/mteb/tasks/MAIR/eng/MAIR.py @@ -22,12 +22,10 @@ def get_metadata(task_name): return TaskMetadata( name="MAIR-" + task_name, - description=""" - Recent information retrieval (IR) models are pre-trained and instruction-tuned on massive datasets and tasks, enabling them to perform well on a wide range of tasks and potentially generalize to unseen tasks with instructions. However, existing IR benchmarks focus on a limited scope of tasks, making them insufficient for evaluating the latest IR models. In this paper, we propose MAIR (Massive Instructed Retrieval Benchmark), a heterogeneous IR benchmark that includes 126 distinct IR tasks across 6 domains, collected from existing datasets. We benchmark state-of-the-art instruction-tuned text embedding models and re-ranking models. Our experiments reveal that instruction-tuned models generally achieve superior performance compared to non-instruction-tuned models on MAIR. Additionally, our results suggest that current instruction-tuned text embedding models and re-ranking models still lack effectiveness in specific long-tail tasks. - """, + description="Recent information retrieval (IR) models are pre-trained and instruction-tuned on massive datasets and tasks, enabling them to perform well on a wide range of tasks and potentially generalize to unseen tasks with instructions. However, existing IR benchmarks focus on a limited scope of tasks, making them insufficient for evaluating the latest IR models. In this paper, we propose MAIR (Massive Instructed Retrieval Benchmark), a heterogeneous IR benchmark that includes 126 distinct IR tasks across 6 domains, collected from existing datasets. We benchmark state-of-the-art instruction-tuned text embedding models and re-ranking models. Our experiments reveal that instruction-tuned models generally achieve superior performance compared to non-instruction-tuned models on MAIR. Additionally, our results suggest that current instruction-tuned text embedding models and re-ranking models still lack effectiveness in specific long-tail tasks.", reference="https://github.com/sunnweiwei/MAIR", dataset={ - "path": "MAIR-Bench/MAIR", + "path": "MAIR-Bench/MAIR-Queries", "revision": "7d24eac886a6ae6653a6b67433e1c302cb0e9ac6", }, type="Retrieval", @@ -51,8 +49,8 @@ def load_data(self, **kwargs): if self.data_loaded: return self.corpus, self.queries, self.relevant_docs = {}, {}, {} - queries_path = self.metadata_dict["dataset"]["path"] + '-Queries' - docs_path = self.metadata_dict["dataset"]["path"] + '-Docs' + queries_path = self.metadata_dict["dataset"]["path"] + docs_path = self.metadata_dict["dataset"]["path"].replace('-Queries', '-Docs') task_name = self.metadata.name.replace("MAIR-", "") query_ds = datasets.load_dataset(queries_path, task_name) corpus_ds = datasets.load_dataset(docs_path, task_name) From 4c85657535bfbebba1eb503690c412063fcf153f Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 02:24:43 -0500 Subject: [PATCH 07/11] Update MAIR.py --- mteb/tasks/MAIR/eng/MAIR.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/mteb/tasks/MAIR/eng/MAIR.py b/mteb/tasks/MAIR/eng/MAIR.py index 801ed8e27..55c50018f 100644 --- a/mteb/tasks/MAIR/eng/MAIR.py +++ b/mteb/tasks/MAIR/eng/MAIR.py @@ -1,15 +1,16 @@ from __future__ import annotations import datasets +import json from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata __all__ = [] -TASK2SPLIT = {'Competition-Math': ['queries'], 'ProofWiki_Proof': ['queries'], 'ProofWiki_Reference': ['queries'], 'Stacks_Proof': ['queries'], 'Stacks_Reference': ['queries'], 'Stein_Proof': ['queries'], 'Stein_Reference': ['queries'], 'Trench_Proof': ['queries'], 'Trench_Reference': ['queries'], 'TAD': ['queries'], 'TAS2': ['queries'], 'StackMathQA': ['queries'], 'APPS': ['queries'], 'CodeEditSearch': ['queries'], 'CodeSearchNet': ['queries'], 'Conala': ['queries'], 'HumanEval-X': ['queries'], 'LeetCode': ['queries'], 'MBPP': ['queries'], 'RepoBench': ['queries'], 'TLDR': ['queries'], 'SWE-Bench-Lite': ['astropy__astropy_12544_queries', 'astropy__astropy_13158_queries', 'astropy__astropy_13162_queries', 'astropy__astropy_13398_queries', 'astropy__astropy_13438_queries', 'astropy__astropy_14439_queries', 'astropy__astropy_14701_queries', 'astropy__astropy_14966_queries', 'astropy__astropy_7441_queries', 'astropy__astropy_8707_queries', 'django__django_11501_queries', 'django__django_12091_queries', 'django__django_13192_queries', 'django__django_13218_queries', 'django__django_13884_queries', 'django__django_14441_queries', 'django__django_15481_queries', 'django__django_15869_queries', 'django__django_16901_queries', 'django__django_17065_queries', 'matplotlib__matplotlib_20518_queries', 'matplotlib__matplotlib_23314_queries', 'matplotlib__matplotlib_23913_queries', 'matplotlib__matplotlib_24627_queries', 'matplotlib__matplotlib_24849_queries', 'matplotlib__matplotlib_25027_queries', 'matplotlib__matplotlib_25238_queries', 'matplotlib__matplotlib_25404_queries', 'matplotlib__matplotlib_25430_queries', 'matplotlib__matplotlib_25746_queries', 'mwaskom__seaborn_2389_queries', 'mwaskom__seaborn_2576_queries', 'mwaskom__seaborn_2766_queries', 'mwaskom__seaborn_2813_queries', 'mwaskom__seaborn_2853_queries', 'mwaskom__seaborn_2946_queries', 'mwaskom__seaborn_2979_queries', 'mwaskom__seaborn_2996_queries', 'mwaskom__seaborn_3202_queries', 'mwaskom__seaborn_3407_queries', 'pallets__flask_4045_queries', 'pallets__flask_4074_queries', 'pallets__flask_4160_queries', 'pallets__flask_4169_queries', 'pallets__flask_4544_queries', 'pallets__flask_4575_queries', 'pallets__flask_4642_queries', 'pallets__flask_4992_queries', 'pallets__flask_5014_queries', 'pallets__flask_5063_queries', 'psf__requests_1537_queries', 'psf__requests_1713_queries', 'psf__requests_1733_queries', 'psf__requests_1766_queries', 'psf__requests_2193_queries', 'psf__requests_2466_queries', 'psf__requests_2821_queries', 'psf__requests_3362_queries', 'psf__requests_5414_queries', 'psf__requests_863_queries', 'pydata__xarray_4339_queries', 'pydata__xarray_4767_queries', 'pydata__xarray_4827_queries', 'pydata__xarray_4911_queries', 'pydata__xarray_4966_queries', 'pydata__xarray_5033_queries', 'pydata__xarray_5682_queries', 'pydata__xarray_6135_queries', 'pydata__xarray_6461_queries', 'pydata__xarray_7391_queries', 'pylint_dev__pylint_4398_queries', 'pylint_dev__pylint_4604_queries', 'pylint_dev__pylint_5175_queries', 'pylint_dev__pylint_5446_queries', 'pylint_dev__pylint_5613_queries', 'pylint_dev__pylint_6358_queries', 'pylint_dev__pylint_6412_queries', 'pylint_dev__pylint_6556_queries', 'pylint_dev__pylint_8281_queries', 'pylint_dev__pylint_8757_queries', 'pytest_dev__pytest_10371_queries', 'pytest_dev__pytest_11047_queries', 'pytest_dev__pytest_11148_queries', 'pytest_dev__pytest_5356_queries', 'pytest_dev__pytest_6680_queries', 'pytest_dev__pytest_7158_queries', 'pytest_dev__pytest_7352_queries', 'pytest_dev__pytest_9064_queries', 'pytest_dev__pytest_9279_queries', 'scikit_learn__scikit_learn_10198_queries', 'scikit_learn__scikit_learn_10803_queries', 'scikit_learn__scikit_learn_10949_queries', 'scikit_learn__scikit_learn_11333_queries', 'scikit_learn__scikit_learn_11635_queries', 'scikit_learn__scikit_learn_12827_queries', 'scikit_learn__scikit_learn_12834_queries', 'scikit_learn__scikit_learn_13302_queries', 'scikit_learn__scikit_learn_13392_queries', 'scikit_learn__scikit_learn_13779_queries', 'sphinx_doc__sphinx_11312_queries', 'sphinx_doc__sphinx_11502_queries', 'sphinx_doc__sphinx_7356_queries', 'sphinx_doc__sphinx_7590_queries', 'sphinx_doc__sphinx_7757_queries', 'sphinx_doc__sphinx_7831_queries', 'sphinx_doc__sphinx_8125_queries', 'sphinx_doc__sphinx_8863_queries', 'sphinx_doc__sphinx_9309_queries', 'sphinx_doc__sphinx_9828_queries', 'sympy__sympy_13091_queries', 'sympy__sympy_14817_queries', 'sympy__sympy_14821_queries', 'sympy__sympy_15151_queries', 'sympy__sympy_15933_queries', 'sympy__sympy_16493_queries', 'sympy__sympy_16858_queries', 'sympy__sympy_17251_queries', 'sympy__sympy_18532_queries', 'sympy__sympy_20212_queries'], 'Apple': ['queries'], 'ConvFinQA': ['queries'], 'FinQA': ['queries'], 'FinanceBench': ['queries'], 'HC3Finance': ['queries'], 'TAT-DQA': ['queries'], 'Trade-the-event': ['queries'], 'AY2': ['queries'], 'ELI5': ['queries'], 'Fever': ['queries'], 'TREx': ['queries'], 'WnCw': ['queries'], 'WnWi': ['queries'], 'WoW': ['queries'], 'zsRE': ['queries'], 'AILA2019-Case': ['queries'], 'AILA2019-Statutes': ['queries'], 'BSARD': ['queries'], 'BillSum': ['queries'], 'CUAD': ['GOOSEHEADINSURANCE_queries', 'GRANTIERRAENERGY_queries', 'HarpoonTherapeutics_queries', 'Monsanto_Company_queries'], 'GerDaLIR': ['queries'], 'LeCaRDv2': ['queries'], 'LegalQuAD': ['queries'], 'REGIR-EU2UK': ['queries'], 'REGIR-UK2EU': ['queries'], 'ArguAna': ['queries'], 'CQADupStack': ['CQADupStack_Android_queries', 'CQADupStack_English_queries', 'CQADupStack_Gaming_queries', 'CQADupStack_Gis_queries', 'CQADupStack_Math_queries', 'CQADupStack_Physics_queries', 'CQADupStack_Programmers_queries', 'CQADupStack_Stats_queries', 'CQADupStack_Tex_queries', 'CQADupStack_Unix_queries', 'CQADupStack_WebMasters_queries', 'CQADupStack_Wordpress_queries'], 'FiQA': ['queries'], 'NFCorpus': ['queries'], 'Quora': ['queries'], 'SciDocs': ['queries'], 'SciFact': ['queries'], 'TopiOCQA': ['queries'], 'Touche': ['queries'], 'Trec-Covid': ['queries'], 'ACORDAR': ['queries'], 'CPCD': ['queries'], 'ChroniclingAmericaQA': ['queries'], 'Monant': ['queries'], 'NTCIR': ['queries'], 'PointRec': ['queries'], 'ProCIS-Dialog': ['queries'], 'ProCIS-Turn': ['queries'], 'QuanTemp': ['queries'], 'WebTableSearch': ['queries'], 'CARE': ['queries'], 'MISeD': ['Bmr006_queries', 'Bro027_queries', 'covid4_queries', 'covid9_queries', 'education4_queries'], 'SParC': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'SParC-SQL': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'Spider': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'Spider-SQL': ['chinook_1_queries', 'college_2_queries', 'store_1_queries'], 'LitSearch': ['queries'], 'CAsT_2019': ['queries'], 'CAsT_2020': ['queries'], 'CAsT_2021': ['queries'], 'CAsT_2022': ['queries'], 'Core_2017': ['queries'], 'Microblog_2011': ['queries'], 'Microblog_2012': ['queries'], 'Microblog_2013': ['queries'], 'Microblog_2014': ['queries'], 'PrecisionMedicine_2017': ['queries'], 'PrecisionMedicine_2018': ['queries'], 'PrecisionMedicine_2019': ['queries'], 'PrecisionMedicine-Article_2019': ['queries'], 'PrecisionMedicine-Article_2020': ['queries'], 'CliniDS_2014': ['queries'], 'CliniDS_2015': ['queries'], 'CliniDS_2016': ['queries'], 'ClinicalTrials_2021': ['queries'], 'ClinicalTrials_2022': ['queries'], 'ClinicalTrials_2023': ['queries'], 'DD_2015': ['queries'], 'DD_2016': ['queries'], 'DD_2017': ['queries'], 'FairRanking_2020': ['queries'], 'FairRanking_2021': ['queries'], 'FairRanking_2022': ['queries'], 'Genomics-AdHoc_2004': ['queries'], 'Genomics-AdHoc_2005': ['queries'], 'Genomics-AdHoc_2006': ['queries'], 'Genomics-AdHoc_2007': ['queries'], 'TREC-Legal_2011': ['queries'], 'NeuCLIR-Tech_2023': ['queries'], 'NeuCLIR_2022': ['queries'], 'NeuCLIR_2023': ['queries'], 'ProductSearch_2023': ['queries'], 'ToT_2023': ['queries'], 'ToT_2024': ['queries'], 'FoodAPI': ['queries'], 'HuggingfaceAPI': ['queries'], 'PytorchAPI': ['queries'], 'SpotifyAPI': ['queries'], 'TMDB': ['queries'], 'TensorAPI': ['queries'], 'ToolBench': ['queries'], 'WeatherAPI': ['queries'], 'ExcluIR': ['queries'], 'Core17': ['queries'], 'News21': ['queries'], 'Robust04': ['queries'], 'InstructIR': ['queries'], 'NevIR': ['queries'], 'IFEval': ['detectable_format__number_bullet_lists_2078_queries', 'detectable_format__number_bullet_lists_102_queries', 'detectable_format__number_bullet_lists_2195_queries', 'detectable_format__number_bullet_lists_2314_queries', 'detectable_format__number_bullet_lists_1934_queries', 'detectable_format__number_bullet_lists_2667_queries', 'detectable_format__number_bullet_lists_1634_queries', 'detectable_format__number_bullet_lists_2100_queries', 'detectable_format__number_bullet_lists_1286_queries', 'detectable_format__number_bullet_lists_2457_queries', 'keywords__letter_frequency_1130_queries', 'keywords__letter_frequency_2107_queries', 'keywords__letter_frequency_1964_queries', 'keywords__letter_frequency_2265_queries', 'detectable_format__constrained_response_3752_queries', 'detectable_format__constrained_response_3755_queries', 'detectable_format__constrained_response_3754_queries', 'detectable_format__constrained_response_3753_queries', 'detectable_format__constrained_response_227_queries', 'detectable_format__constrained_response_3749_queries', 'detectable_format__constrained_response_3756_queries', 'detectable_format__constrained_response_3751_queries', 'detectable_format__constrained_response_3750_queries', 'detectable_format__constrained_response_3757_queries', 'punctuation__no_comma_2245_queries', 'punctuation__no_comma_1107_queries', 'punctuation__no_comma_1162_queries', 'punctuation__no_comma_1418_queries', 'punctuation__no_comma_1001_queries', 'punctuation__no_comma_1187_queries', 'punctuation__no_comma_1738_queries', 'punctuation__no_comma_1300_queries', 'punctuation__no_comma_2069_queries', 'punctuation__no_comma_1643_queries', 'keywords__existence_3156_queries', 'keywords__existence_2485_queries', 'keywords__existence_1531_queries', 'keywords__existence_3732_queries', 'keywords__existence_2662_queries', 'change_case__english_capital_2341_queries', 'change_case__english_capital_3186_queries', 'change_case__english_capital_2563_queries', 'change_case__english_capital_1999_queries', 'change_case__english_capital_24_queries', 'change_case__english_capital_1645_queries', 'change_case__english_lowercase_1122_queries', 'change_case__english_lowercase_1361_queries', 'change_case__english_lowercase_1019_queries', 'change_case__english_lowercase_1087_queries', 'change_case__english_lowercase_1667_queries', 'change_case__english_lowercase_1516_queries', 'change_case__english_lowercase_1535_queries', 'change_case__english_lowercase_1593_queries', 'change_case__english_lowercase_1843_queries', 'keywords__frequency_1393_queries', 'keywords__frequency_1733_queries', 'keywords__frequency_2142_queries', 'keywords__frequency_2292_queries', 'keywords__frequency_1498_queries', 'keywords__frequency_1203_queries', 'keywords__frequency_1857_queries', 'length_constraints__number_sentences_1837_queries', 'length_constraints__number_sentences_2674_queries', 'length_constraints__number_sentences_2617_queries', 'length_constraints__number_sentences_1381_queries', 'length_constraints__number_sentences_2266_queries', 'length_constraints__number_sentences_1268_queries', 'length_constraints__number_sentences_179_queries', 'length_constraints__number_paragraphs_1236_queries', 'length_constraints__number_paragraphs_2941_queries', 'length_constraints__number_paragraphs_1248_queries', 'length_constraints__number_paragraphs_1858_queries', 'length_constraints__number_paragraphs_1377_queries', 'length_constraints__number_paragraphs_2357_queries', 'length_constraints__number_paragraphs_2921_queries', 'length_constraints__number_paragraphs_1082_queries', 'length_constraints__number_paragraphs_2467_queries', 'combination__two_responses_1591_queries', 'combination__two_responses_1793_queries', 'combination__two_responses_2912_queries', 'combination__two_responses_1332_queries', 'combination__two_responses_2383_queries', 'combination__two_responses_136_queries', 'combination__two_responses_1098_queries', 'combination__two_responses_1746_queries', 'combination__two_responses_247_queries', 'combination__two_responses_2918_queries', 'detectable_content__postscript_2273_queries', 'detectable_content__postscript_2070_queries', 'detectable_content__postscript_1800_queries', 'detectable_content__postscript_1305_queries', 'detectable_content__postscript_1759_queries', 'detectable_content__postscript_1367_queries', 'detectable_content__postscript_1537_queries', 'detectable_content__postscript_1879_queries', 'detectable_content__postscript_1246_queries', 'detectable_content__postscript_1620_queries', 'startend__end_checker_2398_queries', 'startend__end_checker_1902_queries', 'startend__end_checker_2268_queries', 'startend__end_checker_1659_queries', 'startend__end_checker_1893_queries', 'startend__end_checker_2475_queries', 'startend__end_checker_1128_queries', 'startend__end_checker_1939_queries', 'startend__end_checker_1446_queries', 'startend__end_checker_1220_queries', 'detectable_content__number_placeholders_3280_queries', 'detectable_content__number_placeholders_1372_queries', 'detectable_content__number_placeholders_3221_queries', 'detectable_content__number_placeholders_1927_queries', 'detectable_content__number_placeholders_3126_queries', 'detectable_content__number_placeholders_2164_queries', 'detectable_content__number_placeholders_2136_queries', 'detectable_content__number_placeholders_2304_queries', 'detectable_content__number_placeholders_3743_queries', 'length_constraints__number_words_2323_queries', 'length_constraints__number_words_1072_queries', 'length_constraints__number_words_1258_queries', 'length_constraints__number_words_1251_queries', 'length_constraints__number_words_164_queries', 'detectable_format__number_highlighted_sections_168_queries', 'detectable_format__number_highlighted_sections_1237_queries', 'detectable_format__number_highlighted_sections_1601_queries', 'detectable_format__number_highlighted_sections_167_queries', 'detectable_format__number_highlighted_sections_1773_queries', 'detectable_format__number_highlighted_sections_1646_queries', 'detectable_format__number_highlighted_sections_1379_queries', 'detectable_format__number_highlighted_sections_1307_queries', 'detectable_format__number_highlighted_sections_1886_queries', 'detectable_format__number_highlighted_sections_1644_queries', 'detectable_format__json_format_1094_queries', 'detectable_format__json_format_1148_queries', 'detectable_format__json_format_1137_queries', 'detectable_format__json_format_1075_queries', 'detectable_format__json_format_2857_queries', 'detectable_format__json_format_3223_queries', 'detectable_format__json_format_2404_queries', 'detectable_format__json_format_321_queries', 'detectable_format__json_format_13_queries', 'change_case__capital_word_frequency_2820_queries', 'change_case__capital_word_frequency_2849_queries', 'change_case__capital_word_frequency_2870_queries', 'change_case__capital_word_frequency_1592_queries', 'detectable_format__multiple_sections_2023_queries', 'detectable_format__multiple_sections_1548_queries', 'detectable_format__multiple_sections_2925_queries', 'detectable_format__multiple_sections_1131_queries', 'detectable_format__multiple_sections_357_queries', 'startend__quotation_2015_queries', 'startend__quotation_219_queries', 'startend__quotation_2010_queries', 'startend__quotation_1658_queries', 'startend__quotation_1325_queries', 'startend__quotation_1776_queries', 'startend__quotation_2239_queries', 'startend__quotation_1845_queries', 'startend__quotation_2209_queries', 'length_constraints__nth_paragraph_first_word_2880_queries', 'length_constraints__nth_paragraph_first_word_181_queries', 'length_constraints__nth_paragraph_first_word_2250_queries', 'length_constraints__nth_paragraph_first_word_2215_queries', 'length_constraints__nth_paragraph_first_word_3073_queries', 'length_constraints__nth_paragraph_first_word_2590_queries', 'length_constraints__nth_paragraph_first_word_3624_queries', 'length_constraints__nth_paragraph_first_word_1954_queries', 'detectable_format__title_1262_queries', 'detectable_format__title_2229_queries', 'detectable_format__title_295_queries', 'detectable_format__title_2097_queries', 'detectable_format__title_1802_queries', 'detectable_format__title_1322_queries', 'detectable_format__title_2969_queries', 'detectable_format__title_3057_queries', 'detectable_format__title_1551_queries', 'detectable_format__title_2807_queries']} +TASK2SPLIT = json.loads('{"Competition-Math": ["queries"], "ProofWiki_Proof": ["queries"], "ProofWiki_Reference": ["queries"], "Stacks_Proof": ["queries"], "Stacks_Reference": ["queries"], "Stein_Proof": ["queries"], "Stein_Reference": ["queries"], "Trench_Proof": ["queries"], "Trench_Reference": ["queries"], "TAD": ["queries"], "TAS2": ["queries"], "StackMathQA": ["queries"], "APPS": ["queries"], "CodeEditSearch": ["queries"], "CodeSearchNet": ["queries"], "Conala": ["queries"], "HumanEval-X": ["queries"], "LeetCode": ["queries"], "MBPP": ["queries"], "RepoBench": ["queries"], "TLDR": ["queries"], "SWE-Bench-Lite": ["astropy__astropy_12544_queries", "astropy__astropy_13158_queries", "astropy__astropy_13162_queries", "astropy__astropy_13398_queries", "astropy__astropy_13438_queries", "astropy__astropy_14439_queries", "astropy__astropy_14701_queries", "astropy__astropy_14966_queries", "astropy__astropy_7441_queries", "astropy__astropy_8707_queries", "django__django_11501_queries", "django__django_12091_queries", "django__django_13192_queries", "django__django_13218_queries", "django__django_13884_queries", "django__django_14441_queries", "django__django_15481_queries", "django__django_15869_queries", "django__django_16901_queries", "django__django_17065_queries", "matplotlib__matplotlib_20518_queries", "matplotlib__matplotlib_23314_queries", "matplotlib__matplotlib_23913_queries", "matplotlib__matplotlib_24627_queries", "matplotlib__matplotlib_24849_queries", "matplotlib__matplotlib_25027_queries", "matplotlib__matplotlib_25238_queries", "matplotlib__matplotlib_25404_queries", "matplotlib__matplotlib_25430_queries", "matplotlib__matplotlib_25746_queries", "mwaskom__seaborn_2389_queries", "mwaskom__seaborn_2576_queries", "mwaskom__seaborn_2766_queries", "mwaskom__seaborn_2813_queries", "mwaskom__seaborn_2853_queries", "mwaskom__seaborn_2946_queries", "mwaskom__seaborn_2979_queries", "mwaskom__seaborn_2996_queries", "mwaskom__seaborn_3202_queries", "mwaskom__seaborn_3407_queries", "pallets__flask_4045_queries", "pallets__flask_4074_queries", "pallets__flask_4160_queries", "pallets__flask_4169_queries", "pallets__flask_4544_queries", "pallets__flask_4575_queries", "pallets__flask_4642_queries", "pallets__flask_4992_queries", "pallets__flask_5014_queries", "pallets__flask_5063_queries", "psf__requests_1537_queries", "psf__requests_1713_queries", "psf__requests_1733_queries", "psf__requests_1766_queries", "psf__requests_2193_queries", "psf__requests_2466_queries", "psf__requests_2821_queries", "psf__requests_3362_queries", "psf__requests_5414_queries", "psf__requests_863_queries", "pydata__xarray_4339_queries", "pydata__xarray_4767_queries", "pydata__xarray_4827_queries", "pydata__xarray_4911_queries", "pydata__xarray_4966_queries", "pydata__xarray_5033_queries", "pydata__xarray_5682_queries", "pydata__xarray_6135_queries", "pydata__xarray_6461_queries", "pydata__xarray_7391_queries", "pylint_dev__pylint_4398_queries", "pylint_dev__pylint_4604_queries", "pylint_dev__pylint_5175_queries", "pylint_dev__pylint_5446_queries", "pylint_dev__pylint_5613_queries", "pylint_dev__pylint_6358_queries", "pylint_dev__pylint_6412_queries", "pylint_dev__pylint_6556_queries", "pylint_dev__pylint_8281_queries", "pylint_dev__pylint_8757_queries", "pytest_dev__pytest_10371_queries", "pytest_dev__pytest_11047_queries", "pytest_dev__pytest_11148_queries", "pytest_dev__pytest_5356_queries", "pytest_dev__pytest_6680_queries", "pytest_dev__pytest_7158_queries", "pytest_dev__pytest_7352_queries", "pytest_dev__pytest_9064_queries", "pytest_dev__pytest_9279_queries", "scikit_learn__scikit_learn_10198_queries", "scikit_learn__scikit_learn_10803_queries", "scikit_learn__scikit_learn_10949_queries", "scikit_learn__scikit_learn_11333_queries", "scikit_learn__scikit_learn_11635_queries", "scikit_learn__scikit_learn_12827_queries", "scikit_learn__scikit_learn_12834_queries", "scikit_learn__scikit_learn_13302_queries", "scikit_learn__scikit_learn_13392_queries", "scikit_learn__scikit_learn_13779_queries", "sphinx_doc__sphinx_11312_queries", "sphinx_doc__sphinx_11502_queries", "sphinx_doc__sphinx_7356_queries", "sphinx_doc__sphinx_7590_queries", "sphinx_doc__sphinx_7757_queries", "sphinx_doc__sphinx_7831_queries", "sphinx_doc__sphinx_8125_queries", "sphinx_doc__sphinx_8863_queries", "sphinx_doc__sphinx_9309_queries", "sphinx_doc__sphinx_9828_queries", "sympy__sympy_13091_queries", "sympy__sympy_14817_queries", "sympy__sympy_14821_queries", "sympy__sympy_15151_queries", "sympy__sympy_15933_queries", "sympy__sympy_16493_queries", "sympy__sympy_16858_queries", "sympy__sympy_17251_queries", "sympy__sympy_18532_queries", "sympy__sympy_20212_queries"], "Apple": ["queries"], "ConvFinQA": ["queries"], "FinQA": ["queries"], "FinanceBench": ["queries"], "HC3Finance": ["queries"], "TAT-DQA": ["queries"], "Trade-the-event": ["queries"], "AY2": ["queries"], "ELI5": ["queries"], "Fever": ["queries"], "TREx": ["queries"], "WnCw": ["queries"], "WnWi": ["queries"], "WoW": ["queries"], "zsRE": ["queries"], "AILA2019-Case": ["queries"], "AILA2019-Statutes": ["queries"], "BSARD": ["queries"], "BillSum": ["queries"], "CUAD": ["GOOSEHEADINSURANCE_queries", "GRANTIERRAENERGY_queries", "HarpoonTherapeutics_queries", "Monsanto_Company_queries"], "GerDaLIR": ["queries"], "LeCaRDv2": ["queries"], "LegalQuAD": ["queries"], "REGIR-EU2UK": ["queries"], "REGIR-UK2EU": ["queries"], "ArguAna": ["queries"], "CQADupStack": ["CQADupStack_Android_queries", "CQADupStack_English_queries", "CQADupStack_Gaming_queries", "CQADupStack_Gis_queries", "CQADupStack_Math_queries", "CQADupStack_Physics_queries", "CQADupStack_Programmers_queries", "CQADupStack_Stats_queries", "CQADupStack_Tex_queries", "CQADupStack_Unix_queries", "CQADupStack_WebMasters_queries", "CQADupStack_Wordpress_queries"], "FiQA": ["queries"], "NFCorpus": ["queries"], "Quora": ["queries"], "SciDocs": ["queries"], "SciFact": ["queries"], "TopiOCQA": ["queries"], "Touche": ["queries"], "Trec-Covid": ["queries"], "ACORDAR": ["queries"], "CPCD": ["queries"], "ChroniclingAmericaQA": ["queries"], "Monant": ["queries"], "NTCIR": ["queries"], "PointRec": ["queries"], "ProCIS-Dialog": ["queries"], "ProCIS-Turn": ["queries"], "QuanTemp": ["queries"], "WebTableSearch": ["queries"], "CARE": ["queries"], "MISeD": ["Bmr006_queries", "Bro027_queries", "covid4_queries", "covid9_queries", "education4_queries"], "SParC": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "SParC-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "LitSearch": ["queries"], "CAsT_2019": ["queries"], "CAsT_2020": ["queries"], "CAsT_2021": ["queries"], "CAsT_2022": ["queries"], "Core_2017": ["queries"], "Microblog_2011": ["queries"], "Microblog_2012": ["queries"], "Microblog_2013": ["queries"], "Microblog_2014": ["queries"], "PrecisionMedicine_2017": ["queries"], "PrecisionMedicine_2018": ["queries"], "PrecisionMedicine_2019": ["queries"], "PrecisionMedicine-Article_2019": ["queries"], "PrecisionMedicine-Article_2020": ["queries"], "CliniDS_2014": ["queries"], "CliniDS_2015": ["queries"], "CliniDS_2016": ["queries"], "ClinicalTrials_2021": ["queries"], "ClinicalTrials_2022": ["queries"], "ClinicalTrials_2023": ["queries"], "DD_2015": ["queries"], "DD_2016": ["queries"], "DD_2017": ["queries"], "FairRanking_2020": ["queries"], "FairRanking_2021": ["queries"], "FairRanking_2022": ["queries"], "Genomics-AdHoc_2004": ["queries"], "Genomics-AdHoc_2005": ["queries"], "Genomics-AdHoc_2006": ["queries"], "Genomics-AdHoc_2007": ["queries"], "TREC-Legal_2011": ["queries"], "NeuCLIR-Tech_2023": ["queries"], "NeuCLIR_2022": ["queries"], "NeuCLIR_2023": ["queries"], "ProductSearch_2023": ["queries"], "ToT_2023": ["queries"], "ToT_2024": ["queries"], "FoodAPI": ["queries"], "HuggingfaceAPI": ["queries"], "PytorchAPI": ["queries"], "SpotifyAPI": ["queries"], "TMDB": ["queries"], "TensorAPI": ["queries"], "ToolBench": ["queries"], "WeatherAPI": ["queries"], "ExcluIR": ["queries"], "Core17": ["queries"], "News21": ["queries"], "Robust04": ["queries"], "InstructIR": ["queries"], "NevIR": ["queries"], "IFEval": ["detectable_format__number_bullet_lists_2078_queries", "detectable_format__number_bullet_lists_102_queries", "detectable_format__number_bullet_lists_2195_queries", "detectable_format__number_bullet_lists_2314_queries", "detectable_format__number_bullet_lists_1934_queries", "detectable_format__number_bullet_lists_2667_queries", "detectable_format__number_bullet_lists_1634_queries", "detectable_format__number_bullet_lists_2100_queries", "detectable_format__number_bullet_lists_1286_queries", "detectable_format__number_bullet_lists_2457_queries", "keywords__letter_frequency_1130_queries", "keywords__letter_frequency_2107_queries", "keywords__letter_frequency_1964_queries", "keywords__letter_frequency_2265_queries", "detectable_format__constrained_response_3752_queries", "detectable_format__constrained_response_3755_queries", "detectable_format__constrained_response_3754_queries", "detectable_format__constrained_response_3753_queries", "detectable_format__constrained_response_227_queries", "detectable_format__constrained_response_3749_queries", "detectable_format__constrained_response_3756_queries", "detectable_format__constrained_response_3751_queries", "detectable_format__constrained_response_3750_queries", "detectable_format__constrained_response_3757_queries", "punctuation__no_comma_2245_queries", "punctuation__no_comma_1107_queries", "punctuation__no_comma_1162_queries", "punctuation__no_comma_1418_queries", "punctuation__no_comma_1001_queries", "punctuation__no_comma_1187_queries", "punctuation__no_comma_1738_queries", "punctuation__no_comma_1300_queries", "punctuation__no_comma_2069_queries", "punctuation__no_comma_1643_queries", "keywords__existence_3156_queries", "keywords__existence_2485_queries", "keywords__existence_1531_queries", "keywords__existence_3732_queries", "keywords__existence_2662_queries", "change_case__english_capital_2341_queries", "change_case__english_capital_3186_queries", "change_case__english_capital_2563_queries", "change_case__english_capital_1999_queries", "change_case__english_capital_24_queries", "change_case__english_capital_1645_queries", "change_case__english_lowercase_1122_queries", "change_case__english_lowercase_1361_queries", "change_case__english_lowercase_1019_queries", "change_case__english_lowercase_1087_queries", "change_case__english_lowercase_1667_queries", "change_case__english_lowercase_1516_queries", "change_case__english_lowercase_1535_queries", "change_case__english_lowercase_1593_queries", "change_case__english_lowercase_1843_queries", "keywords__frequency_1393_queries", "keywords__frequency_1733_queries", "keywords__frequency_2142_queries", "keywords__frequency_2292_queries", "keywords__frequency_1498_queries", "keywords__frequency_1203_queries", "keywords__frequency_1857_queries", "length_constraints__number_sentences_1837_queries", "length_constraints__number_sentences_2674_queries", "length_constraints__number_sentences_2617_queries", "length_constraints__number_sentences_1381_queries", "length_constraints__number_sentences_2266_queries", "length_constraints__number_sentences_1268_queries", "length_constraints__number_sentences_179_queries", "length_constraints__number_paragraphs_1236_queries", "length_constraints__number_paragraphs_2941_queries", "length_constraints__number_paragraphs_1248_queries", "length_constraints__number_paragraphs_1858_queries", "length_constraints__number_paragraphs_1377_queries", "length_constraints__number_paragraphs_2357_queries", "length_constraints__number_paragraphs_2921_queries", "length_constraints__number_paragraphs_1082_queries", "length_constraints__number_paragraphs_2467_queries", "combination__two_responses_1591_queries", "combination__two_responses_1793_queries", "combination__two_responses_2912_queries", "combination__two_responses_1332_queries", "combination__two_responses_2383_queries", "combination__two_responses_136_queries", "combination__two_responses_1098_queries", "combination__two_responses_1746_queries", "combination__two_responses_247_queries", "combination__two_responses_2918_queries", "detectable_content__postscript_2273_queries", "detectable_content__postscript_2070_queries", "detectable_content__postscript_1800_queries", "detectable_content__postscript_1305_queries", "detectable_content__postscript_1759_queries", "detectable_content__postscript_1367_queries", "detectable_content__postscript_1537_queries", "detectable_content__postscript_1879_queries", "detectable_content__postscript_1246_queries", "detectable_content__postscript_1620_queries", "startend__end_checker_2398_queries", "startend__end_checker_1902_queries", "startend__end_checker_2268_queries", "startend__end_checker_1659_queries", "startend__end_checker_1893_queries", "startend__end_checker_2475_queries", "startend__end_checker_1128_queries", "startend__end_checker_1939_queries", "startend__end_checker_1446_queries", "startend__end_checker_1220_queries", "detectable_content__number_placeholders_3280_queries", "detectable_content__number_placeholders_1372_queries", "detectable_content__number_placeholders_3221_queries", "detectable_content__number_placeholders_1927_queries", "detectable_content__number_placeholders_3126_queries", "detectable_content__number_placeholders_2164_queries", "detectable_content__number_placeholders_2136_queries", "detectable_content__number_placeholders_2304_queries", "detectable_content__number_placeholders_3743_queries", "length_constraints__number_words_2323_queries", "length_constraints__number_words_1072_queries", "length_constraints__number_words_1258_queries", "length_constraints__number_words_1251_queries", "length_constraints__number_words_164_queries", "detectable_format__number_highlighted_sections_168_queries", "detectable_format__number_highlighted_sections_1237_queries", "detectable_format__number_highlighted_sections_1601_queries", "detectable_format__number_highlighted_sections_167_queries", "detectable_format__number_highlighted_sections_1773_queries", "detectable_format__number_highlighted_sections_1646_queries", "detectable_format__number_highlighted_sections_1379_queries", "detectable_format__number_highlighted_sections_1307_queries", "detectable_format__number_highlighted_sections_1886_queries", "detectable_format__number_highlighted_sections_1644_queries", "detectable_format__json_format_1094_queries", "detectable_format__json_format_1148_queries", "detectable_format__json_format_1137_queries", "detectable_format__json_format_1075_queries", "detectable_format__json_format_2857_queries", "detectable_format__json_format_3223_queries", "detectable_format__json_format_2404_queries", "detectable_format__json_format_321_queries", "detectable_format__json_format_13_queries", "change_case__capital_word_frequency_2820_queries", "change_case__capital_word_frequency_2849_queries", "change_case__capital_word_frequency_2870_queries", "change_case__capital_word_frequency_1592_queries", "detectable_format__multiple_sections_2023_queries", "detectable_format__multiple_sections_1548_queries", "detectable_format__multiple_sections_2925_queries", "detectable_format__multiple_sections_1131_queries", "detectable_format__multiple_sections_357_queries", "startend__quotation_2015_queries", "startend__quotation_219_queries", "startend__quotation_2010_queries", "startend__quotation_1658_queries", "startend__quotation_1325_queries", "startend__quotation_1776_queries", "startend__quotation_2239_queries", "startend__quotation_1845_queries", "startend__quotation_2209_queries", "length_constraints__nth_paragraph_first_word_2880_queries", "length_constraints__nth_paragraph_first_word_181_queries", "length_constraints__nth_paragraph_first_word_2250_queries", "length_constraints__nth_paragraph_first_word_2215_queries", "length_constraints__nth_paragraph_first_word_3073_queries", "length_constraints__nth_paragraph_first_word_2590_queries", "length_constraints__nth_paragraph_first_word_3624_queries", "length_constraints__nth_paragraph_first_word_1954_queries", "detectable_format__title_1262_queries", "detectable_format__title_2229_queries", "detectable_format__title_295_queries", "detectable_format__title_2097_queries", "detectable_format__title_1802_queries", "detectable_format__title_1322_queries", "detectable_format__title_2969_queries", "detectable_format__title_3057_queries", "detectable_format__title_1551_queries", "detectable_format__title_2807_queries"]}') -MAIR_TASK_CONFIG = {'Competition-Math': 'Academic', 'ProofWiki_Proof': 'Academic', 'ProofWiki_Reference': 'Academic', 'Stacks_Proof': 'Academic', 'Stacks_Reference': 'Academic', 'Stein_Proof': 'Academic', 'Stein_Reference': 'Academic', 'Trench_Proof': 'Academic', 'Trench_Reference': 'Academic', 'TAD': 'Academic', 'TAS2': 'Academic', 'StackMathQA': 'Academic', 'APPS': 'Code', 'CodeEditSearch': 'Code', 'CodeSearchNet': 'Code', 'Conala': 'Code', 'HumanEval-X': 'Code', 'LeetCode': 'Code', 'MBPP': 'Code', 'RepoBench': 'Code', 'TLDR': 'Code', 'SWE-Bench-Lite': 'Code', 'Apple': 'Finance', 'ConvFinQA': 'Finance', 'FinQA': 'Finance', 'FinanceBench': 'Finance', 'HC3Finance': 'Finance', 'TAT-DQA': 'Finance', 'Trade-the-event': 'Finance', 'AY2': 'Web', 'ELI5': 'Web', 'Fever': 'Web', 'TREx': 'Web', 'WnCw': 'Web', 'WnWi': 'Web', 'WoW': 'Web', 'zsRE': 'Web', 'AILA2019-Case': 'Legal', 'AILA2019-Statutes': 'Legal', 'BSARD': 'Legal', 'BillSum': 'Legal', 'CUAD': 'Legal', 'GerDaLIR': 'Legal', 'LeCaRDv2': 'Legal', 'LegalQuAD': 'Legal', 'REGIR-EU2UK': 'Legal', 'REGIR-UK2EU': 'Legal', 'ArguAna': 'Web', 'CQADupStack': 'Web', 'FiQA': 'Finance', 'NFCorpus': 'Medical', 'Quora': 'Web', 'SciDocs': 'Academic', 'SciFact': 'Academic', 'TopiOCQA': 'Web', 'Touche': 'Web', 'Trec-Covid': 'Medical', 'ACORDAR': 'Web', 'CPCD': 'Web', 'ChroniclingAmericaQA': 'Web', 'Monant': 'Medical', 'NTCIR': 'Web', 'PointRec': 'Web', 'ProCIS-Dialog': 'Web', 'ProCIS-Turn': 'Web', 'QuanTemp': 'Web', 'WebTableSearch': 'Web', 'CARE': 'Medical', 'MISeD': 'Web', 'SParC': 'Web', 'SParC-SQL': 'Web', 'Spider': 'Web', 'Spider-SQL': 'Web', 'LitSearch': 'Academic', 'CAsT_2019': 'Web', 'CAsT_2020': 'Web', 'CAsT_2021': 'Web', 'CAsT_2022': 'Web', 'Core_2017': 'Web', 'Microblog_2011': 'Web', 'Microblog_2012': 'Web', 'Microblog_2013': 'Web', 'Microblog_2014': 'Web', 'PrecisionMedicine_2017': 'Medical', 'PrecisionMedicine_2018': 'Medical', 'PrecisionMedicine_2019': 'Medical', 'PrecisionMedicine-Article_2019': 'Medical', 'PrecisionMedicine-Article_2020': 'Medical', 'CliniDS_2014': 'Medical', 'CliniDS_2015': 'Medical', 'CliniDS_2016': 'Medical', 'ClinicalTrials_2021': 'Medical', 'ClinicalTrials_2022': 'Medical', 'ClinicalTrials_2023': 'Medical', 'DD_2015': 'Web', 'DD_2016': 'Web', 'DD_2017': 'Web', 'FairRanking_2020': 'Academic', 'FairRanking_2021': 'Web', 'FairRanking_2022': 'Web', 'Genomics-AdHoc_2004': 'Medical', 'Genomics-AdHoc_2005': 'Medical', 'Genomics-AdHoc_2006': 'Medical', 'Genomics-AdHoc_2007': 'Medical', 'TREC-Legal_2011': 'Legal', 'NeuCLIR-Tech_2023': 'Web', 'NeuCLIR_2022': 'Web', 'NeuCLIR_2023': 'Web', 'ProductSearch_2023': 'Web', 'ToT_2023': 'Web', 'ToT_2024': 'Web', 'FoodAPI': 'Code', 'HuggingfaceAPI': 'Code', 'PytorchAPI': 'Code', 'SpotifyAPI': 'Code', 'TMDB': 'Code', 'TensorAPI': 'Code', 'ToolBench': 'Code', 'WeatherAPI': 'Code', 'ExcluIR': 'Web', 'Core17': 'Web', 'News21': 'Web', 'Robust04': 'Web', 'InstructIR': 'Web', 'NevIR': 'Web', 'IFEval': 'Web'} +MAIR_TASK_CONFIG = json.loads('{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}') _MAIR_CITATION = """@inproceedings{Sun2024MAIR, title={MAIR: A Massive Benchmark for Evaluating Instructed Retrieval}, @@ -50,32 +51,33 @@ def load_data(self, **kwargs): return self.corpus, self.queries, self.relevant_docs = {}, {}, {} queries_path = self.metadata_dict["dataset"]["path"] - docs_path = self.metadata_dict["dataset"]["path"].replace('-Queries', '-Docs') + docs_path = self.metadata_dict["dataset"]["path"].replace("-Queries", "-Docs") task_name = self.metadata.name.replace("MAIR-", "") query_ds = datasets.load_dataset(queries_path, task_name) corpus_ds = datasets.load_dataset(docs_path, task_name) self.metadata.eval_splits = [] for split in query_ds: - doc_split = 'docs' if split == 'queries' else split.replace('_queries', '_docs') - self.queries[split] = {item['qid']: item['query'] for item in query_ds[split]} - self.corpus[split] = {item['id']: {'title': '', 'text': item['doc']} for item in corpus_ds[doc_split]} + doc_split = "docs" if split == "queries" else split.replace("_queries", "_docs") + self.queries[split] = {item["qid"]: item["query"] for item in query_ds[split]} + self.corpus[split] = { + item["id"]: {"title": "", "text": item["doc"]} + for item in corpus_ds[doc_split] + } self.relevant_docs[split] = { - item['qid']: {d['id']: d['score'] for d in item['labels']} + item["qid"]: {d["id"]: d["score"] for d in item["labels"]} for item in query_ds[split] } self.metadata.eval_splits.append(split) self.data_loaded = True + for _task in TASK2SPLIT.keys(): - _class_name = _task.replace('-', '_') + _class_name = _task.replace("-", "_") _new_class = type( _class_name, (AbsTaskRetrieval,), - { - 'metadata': get_metadata(_task), - 'load_data': load_data - } + {"metadata": get_metadata(_task), "load_data": load_data}, ) globals()[_class_name] = _new_class __all__.append(_class_name) From bc53ba3de210d534b867fed4efa27fee4e8ae0a5 Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 02:25:11 -0500 Subject: [PATCH 08/11] Update benchmarks.py --- mteb/benchmarks/benchmarks.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index cd0d584df..a14dbb586 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json from collections.abc import Sequence from dataclasses import dataclass from typing import Annotated @@ -920,11 +921,10 @@ def load_results( ) -MAIR_TASK_CONFIG = {'Competition-Math': 'Academic', 'ProofWiki_Proof': 'Academic', 'ProofWiki_Reference': 'Academic', 'Stacks_Proof': 'Academic', 'Stacks_Reference': 'Academic', 'Stein_Proof': 'Academic', 'Stein_Reference': 'Academic', 'Trench_Proof': 'Academic', 'Trench_Reference': 'Academic', 'TAD': 'Academic', 'TAS2': 'Academic', 'StackMathQA': 'Academic', 'APPS': 'Code', 'CodeEditSearch': 'Code', 'CodeSearchNet': 'Code', 'Conala': 'Code', 'HumanEval-X': 'Code', 'LeetCode': 'Code', 'MBPP': 'Code', 'RepoBench': 'Code', 'TLDR': 'Code', 'SWE-Bench-Lite': 'Code', 'Apple': 'Finance', 'ConvFinQA': 'Finance', 'FinQA': 'Finance', 'FinanceBench': 'Finance', 'HC3Finance': 'Finance', 'TAT-DQA': 'Finance', 'Trade-the-event': 'Finance', 'AY2': 'Web', 'ELI5': 'Web', 'Fever': 'Web', 'TREx': 'Web', 'WnCw': 'Web', 'WnWi': 'Web', 'WoW': 'Web', 'zsRE': 'Web', 'AILA2019-Case': 'Legal', 'AILA2019-Statutes': 'Legal', 'BSARD': 'Legal', 'BillSum': 'Legal', 'CUAD': 'Legal', 'GerDaLIR': 'Legal', 'LeCaRDv2': 'Legal', 'LegalQuAD': 'Legal', 'REGIR-EU2UK': 'Legal', 'REGIR-UK2EU': 'Legal', 'ArguAna': 'Web', 'CQADupStack': 'Web', 'FiQA': 'Finance', 'NFCorpus': 'Medical', 'Quora': 'Web', 'SciDocs': 'Academic', 'SciFact': 'Academic', 'TopiOCQA': 'Web', 'Touche': 'Web', 'Trec-Covid': 'Medical', 'ACORDAR': 'Web', 'CPCD': 'Web', 'ChroniclingAmericaQA': 'Web', 'Monant': 'Medical', 'NTCIR': 'Web', 'PointRec': 'Web', 'ProCIS-Dialog': 'Web', 'ProCIS-Turn': 'Web', 'QuanTemp': 'Web', 'WebTableSearch': 'Web', 'CARE': 'Medical', 'MISeD': 'Web', 'SParC': 'Web', 'SParC-SQL': 'Web', 'Spider': 'Web', 'Spider-SQL': 'Web', 'LitSearch': 'Academic', 'CAsT_2019': 'Web', 'CAsT_2020': 'Web', 'CAsT_2021': 'Web', 'CAsT_2022': 'Web', 'Core_2017': 'Web', 'Microblog_2011': 'Web', 'Microblog_2012': 'Web', 'Microblog_2013': 'Web', 'Microblog_2014': 'Web', 'PrecisionMedicine_2017': 'Medical', 'PrecisionMedicine_2018': 'Medical', 'PrecisionMedicine_2019': 'Medical', 'PrecisionMedicine-Article_2019': 'Medical', 'PrecisionMedicine-Article_2020': 'Medical', 'CliniDS_2014': 'Medical', 'CliniDS_2015': 'Medical', 'CliniDS_2016': 'Medical', 'ClinicalTrials_2021': 'Medical', 'ClinicalTrials_2022': 'Medical', 'ClinicalTrials_2023': 'Medical', 'DD_2015': 'Web', 'DD_2016': 'Web', 'DD_2017': 'Web', 'FairRanking_2020': 'Academic', 'FairRanking_2021': 'Web', 'FairRanking_2022': 'Web', 'Genomics-AdHoc_2004': 'Medical', 'Genomics-AdHoc_2005': 'Medical', 'Genomics-AdHoc_2006': 'Medical', 'Genomics-AdHoc_2007': 'Medical', 'TREC-Legal_2011': 'Legal', 'NeuCLIR-Tech_2023': 'Web', 'NeuCLIR_2022': 'Web', 'NeuCLIR_2023': 'Web', 'ProductSearch_2023': 'Web', 'ToT_2023': 'Web', 'ToT_2024': 'Web', 'FoodAPI': 'Code', 'HuggingfaceAPI': 'Code', 'PytorchAPI': 'Code', 'SpotifyAPI': 'Code', 'TMDB': 'Code', 'TensorAPI': 'Code', 'ToolBench': 'Code', 'WeatherAPI': 'Code', 'ExcluIR': 'Web', 'Core17': 'Web', 'News21': 'Web', 'Robust04': 'Web', 'InstructIR': 'Web', 'NevIR': 'Web', 'IFEval': 'Web'} - +MAIR_TASK_CONFIG = json.loads('{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}') def _get_mair_tasks_by_domain(domain): - assert domain in ['Academic', 'Code', 'Web', 'Legal', 'Medical', 'Finance'] + assert domain in ["Academic", "Code", "Web", "Legal", "Medical", "Finance"] out = [] for task in MAIR_TASK_CONFIG: if MAIR_TASK_CONFIG[task] == domain: @@ -945,9 +945,7 @@ def _get_mair_all_tasks(): MAIR = Benchmark( name="MAIR", - tasks=get_tasks( - tasks=['MAIR-' + name for name in _get_mair_all_tasks()] - ), + tasks=get_tasks(tasks=["MAIR-" + name for name in _get_mair_all_tasks()]), description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", reference="https://github.com/sunnweiwei/MAIR", citation=_MAIR_CITATION, @@ -956,7 +954,7 @@ def _get_mair_all_tasks(): MAIR_WEB = Benchmark( name="MAIR(Web)", tasks=get_tasks( - tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Web')] + tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Web")] ), description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", reference="https://github.com/sunnweiwei/MAIR", @@ -966,7 +964,7 @@ def _get_mair_all_tasks(): MAIR_CODE = Benchmark( name="MAIR(Code)", tasks=get_tasks( - tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Code')] + tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Code")] ), description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", reference="https://github.com/sunnweiwei/MAIR", @@ -976,7 +974,7 @@ def _get_mair_all_tasks(): MAIR_ACADEMIC = Benchmark( name="MAIR(Academic)", tasks=get_tasks( - tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Academic')] + tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Academic")] ), description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", reference="https://github.com/sunnweiwei/MAIR", @@ -986,7 +984,7 @@ def _get_mair_all_tasks(): MAIR_LEGAL = Benchmark( name="MAIR(Legal)", tasks=get_tasks( - tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Legal')] + tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Legal")] ), description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", reference="https://github.com/sunnweiwei/MAIR", @@ -996,7 +994,7 @@ def _get_mair_all_tasks(): MAIR_MEDICAL = Benchmark( name="MAIR(Medical)", tasks=get_tasks( - tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Medical')] + tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Medical")] ), description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", reference="https://github.com/sunnweiwei/MAIR", @@ -1006,7 +1004,7 @@ def _get_mair_all_tasks(): MAIR_FINANCE = Benchmark( name="MAIR(Finance)", tasks=get_tasks( - tasks=['MAIR-' + name for name in _get_mair_tasks_by_domain('Finance')] + tasks=["MAIR-" + name for name in _get_mair_tasks_by_domain("Finance")] ), description="MAIR: A Massive Benchmark for Evaluating Instructed Retrieval", reference="https://github.com/sunnweiwei/MAIR", From 1842e47b060d99dba3885f3f7bd3a03cae6beb4c Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 02:31:23 -0500 Subject: [PATCH 09/11] Update benchmarks.py --- mteb/benchmarks/benchmarks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index a14dbb586..99e391c24 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -921,7 +921,10 @@ def load_results( ) -MAIR_TASK_CONFIG = json.loads('{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}') +MAIR_TASK_CONFIG = json.loads( + '{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}' +) + def _get_mair_tasks_by_domain(domain): assert domain in ["Academic", "Code", "Web", "Legal", "Medical", "Finance"] From 7f003f069c41b15edd49ab7658c2179aca650bd1 Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 02:31:42 -0500 Subject: [PATCH 10/11] Update MAIR.py --- mteb/tasks/MAIR/eng/MAIR.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mteb/tasks/MAIR/eng/MAIR.py b/mteb/tasks/MAIR/eng/MAIR.py index 55c50018f..46a937132 100644 --- a/mteb/tasks/MAIR/eng/MAIR.py +++ b/mteb/tasks/MAIR/eng/MAIR.py @@ -1,16 +1,22 @@ from __future__ import annotations -import datasets import json + +import datasets + from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata __all__ = [] -TASK2SPLIT = json.loads('{"Competition-Math": ["queries"], "ProofWiki_Proof": ["queries"], "ProofWiki_Reference": ["queries"], "Stacks_Proof": ["queries"], "Stacks_Reference": ["queries"], "Stein_Proof": ["queries"], "Stein_Reference": ["queries"], "Trench_Proof": ["queries"], "Trench_Reference": ["queries"], "TAD": ["queries"], "TAS2": ["queries"], "StackMathQA": ["queries"], "APPS": ["queries"], "CodeEditSearch": ["queries"], "CodeSearchNet": ["queries"], "Conala": ["queries"], "HumanEval-X": ["queries"], "LeetCode": ["queries"], "MBPP": ["queries"], "RepoBench": ["queries"], "TLDR": ["queries"], "SWE-Bench-Lite": ["astropy__astropy_12544_queries", "astropy__astropy_13158_queries", "astropy__astropy_13162_queries", "astropy__astropy_13398_queries", "astropy__astropy_13438_queries", "astropy__astropy_14439_queries", "astropy__astropy_14701_queries", "astropy__astropy_14966_queries", "astropy__astropy_7441_queries", "astropy__astropy_8707_queries", "django__django_11501_queries", "django__django_12091_queries", "django__django_13192_queries", "django__django_13218_queries", "django__django_13884_queries", "django__django_14441_queries", "django__django_15481_queries", "django__django_15869_queries", "django__django_16901_queries", "django__django_17065_queries", "matplotlib__matplotlib_20518_queries", "matplotlib__matplotlib_23314_queries", "matplotlib__matplotlib_23913_queries", "matplotlib__matplotlib_24627_queries", "matplotlib__matplotlib_24849_queries", "matplotlib__matplotlib_25027_queries", "matplotlib__matplotlib_25238_queries", "matplotlib__matplotlib_25404_queries", "matplotlib__matplotlib_25430_queries", "matplotlib__matplotlib_25746_queries", "mwaskom__seaborn_2389_queries", "mwaskom__seaborn_2576_queries", "mwaskom__seaborn_2766_queries", "mwaskom__seaborn_2813_queries", "mwaskom__seaborn_2853_queries", "mwaskom__seaborn_2946_queries", "mwaskom__seaborn_2979_queries", "mwaskom__seaborn_2996_queries", "mwaskom__seaborn_3202_queries", "mwaskom__seaborn_3407_queries", "pallets__flask_4045_queries", "pallets__flask_4074_queries", "pallets__flask_4160_queries", "pallets__flask_4169_queries", "pallets__flask_4544_queries", "pallets__flask_4575_queries", "pallets__flask_4642_queries", "pallets__flask_4992_queries", "pallets__flask_5014_queries", "pallets__flask_5063_queries", "psf__requests_1537_queries", "psf__requests_1713_queries", "psf__requests_1733_queries", "psf__requests_1766_queries", "psf__requests_2193_queries", "psf__requests_2466_queries", "psf__requests_2821_queries", "psf__requests_3362_queries", "psf__requests_5414_queries", "psf__requests_863_queries", "pydata__xarray_4339_queries", "pydata__xarray_4767_queries", "pydata__xarray_4827_queries", "pydata__xarray_4911_queries", "pydata__xarray_4966_queries", "pydata__xarray_5033_queries", "pydata__xarray_5682_queries", "pydata__xarray_6135_queries", "pydata__xarray_6461_queries", "pydata__xarray_7391_queries", "pylint_dev__pylint_4398_queries", "pylint_dev__pylint_4604_queries", "pylint_dev__pylint_5175_queries", "pylint_dev__pylint_5446_queries", "pylint_dev__pylint_5613_queries", "pylint_dev__pylint_6358_queries", "pylint_dev__pylint_6412_queries", "pylint_dev__pylint_6556_queries", "pylint_dev__pylint_8281_queries", "pylint_dev__pylint_8757_queries", "pytest_dev__pytest_10371_queries", "pytest_dev__pytest_11047_queries", "pytest_dev__pytest_11148_queries", "pytest_dev__pytest_5356_queries", "pytest_dev__pytest_6680_queries", "pytest_dev__pytest_7158_queries", "pytest_dev__pytest_7352_queries", "pytest_dev__pytest_9064_queries", "pytest_dev__pytest_9279_queries", "scikit_learn__scikit_learn_10198_queries", "scikit_learn__scikit_learn_10803_queries", "scikit_learn__scikit_learn_10949_queries", "scikit_learn__scikit_learn_11333_queries", "scikit_learn__scikit_learn_11635_queries", "scikit_learn__scikit_learn_12827_queries", "scikit_learn__scikit_learn_12834_queries", "scikit_learn__scikit_learn_13302_queries", "scikit_learn__scikit_learn_13392_queries", "scikit_learn__scikit_learn_13779_queries", "sphinx_doc__sphinx_11312_queries", "sphinx_doc__sphinx_11502_queries", "sphinx_doc__sphinx_7356_queries", "sphinx_doc__sphinx_7590_queries", "sphinx_doc__sphinx_7757_queries", "sphinx_doc__sphinx_7831_queries", "sphinx_doc__sphinx_8125_queries", "sphinx_doc__sphinx_8863_queries", "sphinx_doc__sphinx_9309_queries", "sphinx_doc__sphinx_9828_queries", "sympy__sympy_13091_queries", "sympy__sympy_14817_queries", "sympy__sympy_14821_queries", "sympy__sympy_15151_queries", "sympy__sympy_15933_queries", "sympy__sympy_16493_queries", "sympy__sympy_16858_queries", "sympy__sympy_17251_queries", "sympy__sympy_18532_queries", "sympy__sympy_20212_queries"], "Apple": ["queries"], "ConvFinQA": ["queries"], "FinQA": ["queries"], "FinanceBench": ["queries"], "HC3Finance": ["queries"], "TAT-DQA": ["queries"], "Trade-the-event": ["queries"], "AY2": ["queries"], "ELI5": ["queries"], "Fever": ["queries"], "TREx": ["queries"], "WnCw": ["queries"], "WnWi": ["queries"], "WoW": ["queries"], "zsRE": ["queries"], "AILA2019-Case": ["queries"], "AILA2019-Statutes": ["queries"], "BSARD": ["queries"], "BillSum": ["queries"], "CUAD": ["GOOSEHEADINSURANCE_queries", "GRANTIERRAENERGY_queries", "HarpoonTherapeutics_queries", "Monsanto_Company_queries"], "GerDaLIR": ["queries"], "LeCaRDv2": ["queries"], "LegalQuAD": ["queries"], "REGIR-EU2UK": ["queries"], "REGIR-UK2EU": ["queries"], "ArguAna": ["queries"], "CQADupStack": ["CQADupStack_Android_queries", "CQADupStack_English_queries", "CQADupStack_Gaming_queries", "CQADupStack_Gis_queries", "CQADupStack_Math_queries", "CQADupStack_Physics_queries", "CQADupStack_Programmers_queries", "CQADupStack_Stats_queries", "CQADupStack_Tex_queries", "CQADupStack_Unix_queries", "CQADupStack_WebMasters_queries", "CQADupStack_Wordpress_queries"], "FiQA": ["queries"], "NFCorpus": ["queries"], "Quora": ["queries"], "SciDocs": ["queries"], "SciFact": ["queries"], "TopiOCQA": ["queries"], "Touche": ["queries"], "Trec-Covid": ["queries"], "ACORDAR": ["queries"], "CPCD": ["queries"], "ChroniclingAmericaQA": ["queries"], "Monant": ["queries"], "NTCIR": ["queries"], "PointRec": ["queries"], "ProCIS-Dialog": ["queries"], "ProCIS-Turn": ["queries"], "QuanTemp": ["queries"], "WebTableSearch": ["queries"], "CARE": ["queries"], "MISeD": ["Bmr006_queries", "Bro027_queries", "covid4_queries", "covid9_queries", "education4_queries"], "SParC": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "SParC-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "LitSearch": ["queries"], "CAsT_2019": ["queries"], "CAsT_2020": ["queries"], "CAsT_2021": ["queries"], "CAsT_2022": ["queries"], "Core_2017": ["queries"], "Microblog_2011": ["queries"], "Microblog_2012": ["queries"], "Microblog_2013": ["queries"], "Microblog_2014": ["queries"], "PrecisionMedicine_2017": ["queries"], "PrecisionMedicine_2018": ["queries"], "PrecisionMedicine_2019": ["queries"], "PrecisionMedicine-Article_2019": ["queries"], "PrecisionMedicine-Article_2020": ["queries"], "CliniDS_2014": ["queries"], "CliniDS_2015": ["queries"], "CliniDS_2016": ["queries"], "ClinicalTrials_2021": ["queries"], "ClinicalTrials_2022": ["queries"], "ClinicalTrials_2023": ["queries"], "DD_2015": ["queries"], "DD_2016": ["queries"], "DD_2017": ["queries"], "FairRanking_2020": ["queries"], "FairRanking_2021": ["queries"], "FairRanking_2022": ["queries"], "Genomics-AdHoc_2004": ["queries"], "Genomics-AdHoc_2005": ["queries"], "Genomics-AdHoc_2006": ["queries"], "Genomics-AdHoc_2007": ["queries"], "TREC-Legal_2011": ["queries"], "NeuCLIR-Tech_2023": ["queries"], "NeuCLIR_2022": ["queries"], "NeuCLIR_2023": ["queries"], "ProductSearch_2023": ["queries"], "ToT_2023": ["queries"], "ToT_2024": ["queries"], "FoodAPI": ["queries"], "HuggingfaceAPI": ["queries"], "PytorchAPI": ["queries"], "SpotifyAPI": ["queries"], "TMDB": ["queries"], "TensorAPI": ["queries"], "ToolBench": ["queries"], "WeatherAPI": ["queries"], "ExcluIR": ["queries"], "Core17": ["queries"], "News21": ["queries"], "Robust04": ["queries"], "InstructIR": ["queries"], "NevIR": ["queries"], "IFEval": ["detectable_format__number_bullet_lists_2078_queries", "detectable_format__number_bullet_lists_102_queries", "detectable_format__number_bullet_lists_2195_queries", "detectable_format__number_bullet_lists_2314_queries", "detectable_format__number_bullet_lists_1934_queries", "detectable_format__number_bullet_lists_2667_queries", "detectable_format__number_bullet_lists_1634_queries", "detectable_format__number_bullet_lists_2100_queries", "detectable_format__number_bullet_lists_1286_queries", "detectable_format__number_bullet_lists_2457_queries", "keywords__letter_frequency_1130_queries", "keywords__letter_frequency_2107_queries", "keywords__letter_frequency_1964_queries", "keywords__letter_frequency_2265_queries", "detectable_format__constrained_response_3752_queries", "detectable_format__constrained_response_3755_queries", "detectable_format__constrained_response_3754_queries", "detectable_format__constrained_response_3753_queries", "detectable_format__constrained_response_227_queries", "detectable_format__constrained_response_3749_queries", "detectable_format__constrained_response_3756_queries", "detectable_format__constrained_response_3751_queries", "detectable_format__constrained_response_3750_queries", "detectable_format__constrained_response_3757_queries", "punctuation__no_comma_2245_queries", "punctuation__no_comma_1107_queries", "punctuation__no_comma_1162_queries", "punctuation__no_comma_1418_queries", "punctuation__no_comma_1001_queries", "punctuation__no_comma_1187_queries", "punctuation__no_comma_1738_queries", "punctuation__no_comma_1300_queries", "punctuation__no_comma_2069_queries", "punctuation__no_comma_1643_queries", "keywords__existence_3156_queries", "keywords__existence_2485_queries", "keywords__existence_1531_queries", "keywords__existence_3732_queries", "keywords__existence_2662_queries", "change_case__english_capital_2341_queries", "change_case__english_capital_3186_queries", "change_case__english_capital_2563_queries", "change_case__english_capital_1999_queries", "change_case__english_capital_24_queries", "change_case__english_capital_1645_queries", "change_case__english_lowercase_1122_queries", "change_case__english_lowercase_1361_queries", "change_case__english_lowercase_1019_queries", "change_case__english_lowercase_1087_queries", "change_case__english_lowercase_1667_queries", "change_case__english_lowercase_1516_queries", "change_case__english_lowercase_1535_queries", "change_case__english_lowercase_1593_queries", "change_case__english_lowercase_1843_queries", "keywords__frequency_1393_queries", "keywords__frequency_1733_queries", "keywords__frequency_2142_queries", "keywords__frequency_2292_queries", "keywords__frequency_1498_queries", "keywords__frequency_1203_queries", "keywords__frequency_1857_queries", "length_constraints__number_sentences_1837_queries", "length_constraints__number_sentences_2674_queries", "length_constraints__number_sentences_2617_queries", "length_constraints__number_sentences_1381_queries", "length_constraints__number_sentences_2266_queries", "length_constraints__number_sentences_1268_queries", "length_constraints__number_sentences_179_queries", "length_constraints__number_paragraphs_1236_queries", "length_constraints__number_paragraphs_2941_queries", "length_constraints__number_paragraphs_1248_queries", "length_constraints__number_paragraphs_1858_queries", "length_constraints__number_paragraphs_1377_queries", "length_constraints__number_paragraphs_2357_queries", "length_constraints__number_paragraphs_2921_queries", "length_constraints__number_paragraphs_1082_queries", "length_constraints__number_paragraphs_2467_queries", "combination__two_responses_1591_queries", "combination__two_responses_1793_queries", "combination__two_responses_2912_queries", "combination__two_responses_1332_queries", "combination__two_responses_2383_queries", "combination__two_responses_136_queries", "combination__two_responses_1098_queries", "combination__two_responses_1746_queries", "combination__two_responses_247_queries", "combination__two_responses_2918_queries", "detectable_content__postscript_2273_queries", "detectable_content__postscript_2070_queries", "detectable_content__postscript_1800_queries", "detectable_content__postscript_1305_queries", "detectable_content__postscript_1759_queries", "detectable_content__postscript_1367_queries", "detectable_content__postscript_1537_queries", "detectable_content__postscript_1879_queries", "detectable_content__postscript_1246_queries", "detectable_content__postscript_1620_queries", "startend__end_checker_2398_queries", "startend__end_checker_1902_queries", "startend__end_checker_2268_queries", "startend__end_checker_1659_queries", "startend__end_checker_1893_queries", "startend__end_checker_2475_queries", "startend__end_checker_1128_queries", "startend__end_checker_1939_queries", "startend__end_checker_1446_queries", "startend__end_checker_1220_queries", "detectable_content__number_placeholders_3280_queries", "detectable_content__number_placeholders_1372_queries", "detectable_content__number_placeholders_3221_queries", "detectable_content__number_placeholders_1927_queries", "detectable_content__number_placeholders_3126_queries", "detectable_content__number_placeholders_2164_queries", "detectable_content__number_placeholders_2136_queries", "detectable_content__number_placeholders_2304_queries", "detectable_content__number_placeholders_3743_queries", "length_constraints__number_words_2323_queries", "length_constraints__number_words_1072_queries", "length_constraints__number_words_1258_queries", "length_constraints__number_words_1251_queries", "length_constraints__number_words_164_queries", "detectable_format__number_highlighted_sections_168_queries", "detectable_format__number_highlighted_sections_1237_queries", "detectable_format__number_highlighted_sections_1601_queries", "detectable_format__number_highlighted_sections_167_queries", "detectable_format__number_highlighted_sections_1773_queries", "detectable_format__number_highlighted_sections_1646_queries", "detectable_format__number_highlighted_sections_1379_queries", "detectable_format__number_highlighted_sections_1307_queries", "detectable_format__number_highlighted_sections_1886_queries", "detectable_format__number_highlighted_sections_1644_queries", "detectable_format__json_format_1094_queries", "detectable_format__json_format_1148_queries", "detectable_format__json_format_1137_queries", "detectable_format__json_format_1075_queries", "detectable_format__json_format_2857_queries", "detectable_format__json_format_3223_queries", "detectable_format__json_format_2404_queries", "detectable_format__json_format_321_queries", "detectable_format__json_format_13_queries", "change_case__capital_word_frequency_2820_queries", "change_case__capital_word_frequency_2849_queries", "change_case__capital_word_frequency_2870_queries", "change_case__capital_word_frequency_1592_queries", "detectable_format__multiple_sections_2023_queries", "detectable_format__multiple_sections_1548_queries", "detectable_format__multiple_sections_2925_queries", "detectable_format__multiple_sections_1131_queries", "detectable_format__multiple_sections_357_queries", "startend__quotation_2015_queries", "startend__quotation_219_queries", "startend__quotation_2010_queries", "startend__quotation_1658_queries", "startend__quotation_1325_queries", "startend__quotation_1776_queries", "startend__quotation_2239_queries", "startend__quotation_1845_queries", "startend__quotation_2209_queries", "length_constraints__nth_paragraph_first_word_2880_queries", "length_constraints__nth_paragraph_first_word_181_queries", "length_constraints__nth_paragraph_first_word_2250_queries", "length_constraints__nth_paragraph_first_word_2215_queries", "length_constraints__nth_paragraph_first_word_3073_queries", "length_constraints__nth_paragraph_first_word_2590_queries", "length_constraints__nth_paragraph_first_word_3624_queries", "length_constraints__nth_paragraph_first_word_1954_queries", "detectable_format__title_1262_queries", "detectable_format__title_2229_queries", "detectable_format__title_295_queries", "detectable_format__title_2097_queries", "detectable_format__title_1802_queries", "detectable_format__title_1322_queries", "detectable_format__title_2969_queries", "detectable_format__title_3057_queries", "detectable_format__title_1551_queries", "detectable_format__title_2807_queries"]}') +TASK2SPLIT = json.loads( + '{"Competition-Math": ["queries"], "ProofWiki_Proof": ["queries"], "ProofWiki_Reference": ["queries"], "Stacks_Proof": ["queries"], "Stacks_Reference": ["queries"], "Stein_Proof": ["queries"], "Stein_Reference": ["queries"], "Trench_Proof": ["queries"], "Trench_Reference": ["queries"], "TAD": ["queries"], "TAS2": ["queries"], "StackMathQA": ["queries"], "APPS": ["queries"], "CodeEditSearch": ["queries"], "CodeSearchNet": ["queries"], "Conala": ["queries"], "HumanEval-X": ["queries"], "LeetCode": ["queries"], "MBPP": ["queries"], "RepoBench": ["queries"], "TLDR": ["queries"], "SWE-Bench-Lite": ["astropy__astropy_12544_queries", "astropy__astropy_13158_queries", "astropy__astropy_13162_queries", "astropy__astropy_13398_queries", "astropy__astropy_13438_queries", "astropy__astropy_14439_queries", "astropy__astropy_14701_queries", "astropy__astropy_14966_queries", "astropy__astropy_7441_queries", "astropy__astropy_8707_queries", "django__django_11501_queries", "django__django_12091_queries", "django__django_13192_queries", "django__django_13218_queries", "django__django_13884_queries", "django__django_14441_queries", "django__django_15481_queries", "django__django_15869_queries", "django__django_16901_queries", "django__django_17065_queries", "matplotlib__matplotlib_20518_queries", "matplotlib__matplotlib_23314_queries", "matplotlib__matplotlib_23913_queries", "matplotlib__matplotlib_24627_queries", "matplotlib__matplotlib_24849_queries", "matplotlib__matplotlib_25027_queries", "matplotlib__matplotlib_25238_queries", "matplotlib__matplotlib_25404_queries", "matplotlib__matplotlib_25430_queries", "matplotlib__matplotlib_25746_queries", "mwaskom__seaborn_2389_queries", "mwaskom__seaborn_2576_queries", "mwaskom__seaborn_2766_queries", "mwaskom__seaborn_2813_queries", "mwaskom__seaborn_2853_queries", "mwaskom__seaborn_2946_queries", "mwaskom__seaborn_2979_queries", "mwaskom__seaborn_2996_queries", "mwaskom__seaborn_3202_queries", "mwaskom__seaborn_3407_queries", "pallets__flask_4045_queries", "pallets__flask_4074_queries", "pallets__flask_4160_queries", "pallets__flask_4169_queries", "pallets__flask_4544_queries", "pallets__flask_4575_queries", "pallets__flask_4642_queries", "pallets__flask_4992_queries", "pallets__flask_5014_queries", "pallets__flask_5063_queries", "psf__requests_1537_queries", "psf__requests_1713_queries", "psf__requests_1733_queries", "psf__requests_1766_queries", "psf__requests_2193_queries", "psf__requests_2466_queries", "psf__requests_2821_queries", "psf__requests_3362_queries", "psf__requests_5414_queries", "psf__requests_863_queries", "pydata__xarray_4339_queries", "pydata__xarray_4767_queries", "pydata__xarray_4827_queries", "pydata__xarray_4911_queries", "pydata__xarray_4966_queries", "pydata__xarray_5033_queries", "pydata__xarray_5682_queries", "pydata__xarray_6135_queries", "pydata__xarray_6461_queries", "pydata__xarray_7391_queries", "pylint_dev__pylint_4398_queries", "pylint_dev__pylint_4604_queries", "pylint_dev__pylint_5175_queries", "pylint_dev__pylint_5446_queries", "pylint_dev__pylint_5613_queries", "pylint_dev__pylint_6358_queries", "pylint_dev__pylint_6412_queries", "pylint_dev__pylint_6556_queries", "pylint_dev__pylint_8281_queries", "pylint_dev__pylint_8757_queries", "pytest_dev__pytest_10371_queries", "pytest_dev__pytest_11047_queries", "pytest_dev__pytest_11148_queries", "pytest_dev__pytest_5356_queries", "pytest_dev__pytest_6680_queries", "pytest_dev__pytest_7158_queries", "pytest_dev__pytest_7352_queries", "pytest_dev__pytest_9064_queries", "pytest_dev__pytest_9279_queries", "scikit_learn__scikit_learn_10198_queries", "scikit_learn__scikit_learn_10803_queries", "scikit_learn__scikit_learn_10949_queries", "scikit_learn__scikit_learn_11333_queries", "scikit_learn__scikit_learn_11635_queries", "scikit_learn__scikit_learn_12827_queries", "scikit_learn__scikit_learn_12834_queries", "scikit_learn__scikit_learn_13302_queries", "scikit_learn__scikit_learn_13392_queries", "scikit_learn__scikit_learn_13779_queries", "sphinx_doc__sphinx_11312_queries", "sphinx_doc__sphinx_11502_queries", "sphinx_doc__sphinx_7356_queries", "sphinx_doc__sphinx_7590_queries", "sphinx_doc__sphinx_7757_queries", "sphinx_doc__sphinx_7831_queries", "sphinx_doc__sphinx_8125_queries", "sphinx_doc__sphinx_8863_queries", "sphinx_doc__sphinx_9309_queries", "sphinx_doc__sphinx_9828_queries", "sympy__sympy_13091_queries", "sympy__sympy_14817_queries", "sympy__sympy_14821_queries", "sympy__sympy_15151_queries", "sympy__sympy_15933_queries", "sympy__sympy_16493_queries", "sympy__sympy_16858_queries", "sympy__sympy_17251_queries", "sympy__sympy_18532_queries", "sympy__sympy_20212_queries"], "Apple": ["queries"], "ConvFinQA": ["queries"], "FinQA": ["queries"], "FinanceBench": ["queries"], "HC3Finance": ["queries"], "TAT-DQA": ["queries"], "Trade-the-event": ["queries"], "AY2": ["queries"], "ELI5": ["queries"], "Fever": ["queries"], "TREx": ["queries"], "WnCw": ["queries"], "WnWi": ["queries"], "WoW": ["queries"], "zsRE": ["queries"], "AILA2019-Case": ["queries"], "AILA2019-Statutes": ["queries"], "BSARD": ["queries"], "BillSum": ["queries"], "CUAD": ["GOOSEHEADINSURANCE_queries", "GRANTIERRAENERGY_queries", "HarpoonTherapeutics_queries", "Monsanto_Company_queries"], "GerDaLIR": ["queries"], "LeCaRDv2": ["queries"], "LegalQuAD": ["queries"], "REGIR-EU2UK": ["queries"], "REGIR-UK2EU": ["queries"], "ArguAna": ["queries"], "CQADupStack": ["CQADupStack_Android_queries", "CQADupStack_English_queries", "CQADupStack_Gaming_queries", "CQADupStack_Gis_queries", "CQADupStack_Math_queries", "CQADupStack_Physics_queries", "CQADupStack_Programmers_queries", "CQADupStack_Stats_queries", "CQADupStack_Tex_queries", "CQADupStack_Unix_queries", "CQADupStack_WebMasters_queries", "CQADupStack_Wordpress_queries"], "FiQA": ["queries"], "NFCorpus": ["queries"], "Quora": ["queries"], "SciDocs": ["queries"], "SciFact": ["queries"], "TopiOCQA": ["queries"], "Touche": ["queries"], "Trec-Covid": ["queries"], "ACORDAR": ["queries"], "CPCD": ["queries"], "ChroniclingAmericaQA": ["queries"], "Monant": ["queries"], "NTCIR": ["queries"], "PointRec": ["queries"], "ProCIS-Dialog": ["queries"], "ProCIS-Turn": ["queries"], "QuanTemp": ["queries"], "WebTableSearch": ["queries"], "CARE": ["queries"], "MISeD": ["Bmr006_queries", "Bro027_queries", "covid4_queries", "covid9_queries", "education4_queries"], "SParC": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "SParC-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "Spider-SQL": ["chinook_1_queries", "college_2_queries", "store_1_queries"], "LitSearch": ["queries"], "CAsT_2019": ["queries"], "CAsT_2020": ["queries"], "CAsT_2021": ["queries"], "CAsT_2022": ["queries"], "Core_2017": ["queries"], "Microblog_2011": ["queries"], "Microblog_2012": ["queries"], "Microblog_2013": ["queries"], "Microblog_2014": ["queries"], "PrecisionMedicine_2017": ["queries"], "PrecisionMedicine_2018": ["queries"], "PrecisionMedicine_2019": ["queries"], "PrecisionMedicine-Article_2019": ["queries"], "PrecisionMedicine-Article_2020": ["queries"], "CliniDS_2014": ["queries"], "CliniDS_2015": ["queries"], "CliniDS_2016": ["queries"], "ClinicalTrials_2021": ["queries"], "ClinicalTrials_2022": ["queries"], "ClinicalTrials_2023": ["queries"], "DD_2015": ["queries"], "DD_2016": ["queries"], "DD_2017": ["queries"], "FairRanking_2020": ["queries"], "FairRanking_2021": ["queries"], "FairRanking_2022": ["queries"], "Genomics-AdHoc_2004": ["queries"], "Genomics-AdHoc_2005": ["queries"], "Genomics-AdHoc_2006": ["queries"], "Genomics-AdHoc_2007": ["queries"], "TREC-Legal_2011": ["queries"], "NeuCLIR-Tech_2023": ["queries"], "NeuCLIR_2022": ["queries"], "NeuCLIR_2023": ["queries"], "ProductSearch_2023": ["queries"], "ToT_2023": ["queries"], "ToT_2024": ["queries"], "FoodAPI": ["queries"], "HuggingfaceAPI": ["queries"], "PytorchAPI": ["queries"], "SpotifyAPI": ["queries"], "TMDB": ["queries"], "TensorAPI": ["queries"], "ToolBench": ["queries"], "WeatherAPI": ["queries"], "ExcluIR": ["queries"], "Core17": ["queries"], "News21": ["queries"], "Robust04": ["queries"], "InstructIR": ["queries"], "NevIR": ["queries"], "IFEval": ["detectable_format__number_bullet_lists_2078_queries", "detectable_format__number_bullet_lists_102_queries", "detectable_format__number_bullet_lists_2195_queries", "detectable_format__number_bullet_lists_2314_queries", "detectable_format__number_bullet_lists_1934_queries", "detectable_format__number_bullet_lists_2667_queries", "detectable_format__number_bullet_lists_1634_queries", "detectable_format__number_bullet_lists_2100_queries", "detectable_format__number_bullet_lists_1286_queries", "detectable_format__number_bullet_lists_2457_queries", "keywords__letter_frequency_1130_queries", "keywords__letter_frequency_2107_queries", "keywords__letter_frequency_1964_queries", "keywords__letter_frequency_2265_queries", "detectable_format__constrained_response_3752_queries", "detectable_format__constrained_response_3755_queries", "detectable_format__constrained_response_3754_queries", "detectable_format__constrained_response_3753_queries", "detectable_format__constrained_response_227_queries", "detectable_format__constrained_response_3749_queries", "detectable_format__constrained_response_3756_queries", "detectable_format__constrained_response_3751_queries", "detectable_format__constrained_response_3750_queries", "detectable_format__constrained_response_3757_queries", "punctuation__no_comma_2245_queries", "punctuation__no_comma_1107_queries", "punctuation__no_comma_1162_queries", "punctuation__no_comma_1418_queries", "punctuation__no_comma_1001_queries", "punctuation__no_comma_1187_queries", "punctuation__no_comma_1738_queries", "punctuation__no_comma_1300_queries", "punctuation__no_comma_2069_queries", "punctuation__no_comma_1643_queries", "keywords__existence_3156_queries", "keywords__existence_2485_queries", "keywords__existence_1531_queries", "keywords__existence_3732_queries", "keywords__existence_2662_queries", "change_case__english_capital_2341_queries", "change_case__english_capital_3186_queries", "change_case__english_capital_2563_queries", "change_case__english_capital_1999_queries", "change_case__english_capital_24_queries", "change_case__english_capital_1645_queries", "change_case__english_lowercase_1122_queries", "change_case__english_lowercase_1361_queries", "change_case__english_lowercase_1019_queries", "change_case__english_lowercase_1087_queries", "change_case__english_lowercase_1667_queries", "change_case__english_lowercase_1516_queries", "change_case__english_lowercase_1535_queries", "change_case__english_lowercase_1593_queries", "change_case__english_lowercase_1843_queries", "keywords__frequency_1393_queries", "keywords__frequency_1733_queries", "keywords__frequency_2142_queries", "keywords__frequency_2292_queries", "keywords__frequency_1498_queries", "keywords__frequency_1203_queries", "keywords__frequency_1857_queries", "length_constraints__number_sentences_1837_queries", "length_constraints__number_sentences_2674_queries", "length_constraints__number_sentences_2617_queries", "length_constraints__number_sentences_1381_queries", "length_constraints__number_sentences_2266_queries", "length_constraints__number_sentences_1268_queries", "length_constraints__number_sentences_179_queries", "length_constraints__number_paragraphs_1236_queries", "length_constraints__number_paragraphs_2941_queries", "length_constraints__number_paragraphs_1248_queries", "length_constraints__number_paragraphs_1858_queries", "length_constraints__number_paragraphs_1377_queries", "length_constraints__number_paragraphs_2357_queries", "length_constraints__number_paragraphs_2921_queries", "length_constraints__number_paragraphs_1082_queries", "length_constraints__number_paragraphs_2467_queries", "combination__two_responses_1591_queries", "combination__two_responses_1793_queries", "combination__two_responses_2912_queries", "combination__two_responses_1332_queries", "combination__two_responses_2383_queries", "combination__two_responses_136_queries", "combination__two_responses_1098_queries", "combination__two_responses_1746_queries", "combination__two_responses_247_queries", "combination__two_responses_2918_queries", "detectable_content__postscript_2273_queries", "detectable_content__postscript_2070_queries", "detectable_content__postscript_1800_queries", "detectable_content__postscript_1305_queries", "detectable_content__postscript_1759_queries", "detectable_content__postscript_1367_queries", "detectable_content__postscript_1537_queries", "detectable_content__postscript_1879_queries", "detectable_content__postscript_1246_queries", "detectable_content__postscript_1620_queries", "startend__end_checker_2398_queries", "startend__end_checker_1902_queries", "startend__end_checker_2268_queries", "startend__end_checker_1659_queries", "startend__end_checker_1893_queries", "startend__end_checker_2475_queries", "startend__end_checker_1128_queries", "startend__end_checker_1939_queries", "startend__end_checker_1446_queries", "startend__end_checker_1220_queries", "detectable_content__number_placeholders_3280_queries", "detectable_content__number_placeholders_1372_queries", "detectable_content__number_placeholders_3221_queries", "detectable_content__number_placeholders_1927_queries", "detectable_content__number_placeholders_3126_queries", "detectable_content__number_placeholders_2164_queries", "detectable_content__number_placeholders_2136_queries", "detectable_content__number_placeholders_2304_queries", "detectable_content__number_placeholders_3743_queries", "length_constraints__number_words_2323_queries", "length_constraints__number_words_1072_queries", "length_constraints__number_words_1258_queries", "length_constraints__number_words_1251_queries", "length_constraints__number_words_164_queries", "detectable_format__number_highlighted_sections_168_queries", "detectable_format__number_highlighted_sections_1237_queries", "detectable_format__number_highlighted_sections_1601_queries", "detectable_format__number_highlighted_sections_167_queries", "detectable_format__number_highlighted_sections_1773_queries", "detectable_format__number_highlighted_sections_1646_queries", "detectable_format__number_highlighted_sections_1379_queries", "detectable_format__number_highlighted_sections_1307_queries", "detectable_format__number_highlighted_sections_1886_queries", "detectable_format__number_highlighted_sections_1644_queries", "detectable_format__json_format_1094_queries", "detectable_format__json_format_1148_queries", "detectable_format__json_format_1137_queries", "detectable_format__json_format_1075_queries", "detectable_format__json_format_2857_queries", "detectable_format__json_format_3223_queries", "detectable_format__json_format_2404_queries", "detectable_format__json_format_321_queries", "detectable_format__json_format_13_queries", "change_case__capital_word_frequency_2820_queries", "change_case__capital_word_frequency_2849_queries", "change_case__capital_word_frequency_2870_queries", "change_case__capital_word_frequency_1592_queries", "detectable_format__multiple_sections_2023_queries", "detectable_format__multiple_sections_1548_queries", "detectable_format__multiple_sections_2925_queries", "detectable_format__multiple_sections_1131_queries", "detectable_format__multiple_sections_357_queries", "startend__quotation_2015_queries", "startend__quotation_219_queries", "startend__quotation_2010_queries", "startend__quotation_1658_queries", "startend__quotation_1325_queries", "startend__quotation_1776_queries", "startend__quotation_2239_queries", "startend__quotation_1845_queries", "startend__quotation_2209_queries", "length_constraints__nth_paragraph_first_word_2880_queries", "length_constraints__nth_paragraph_first_word_181_queries", "length_constraints__nth_paragraph_first_word_2250_queries", "length_constraints__nth_paragraph_first_word_2215_queries", "length_constraints__nth_paragraph_first_word_3073_queries", "length_constraints__nth_paragraph_first_word_2590_queries", "length_constraints__nth_paragraph_first_word_3624_queries", "length_constraints__nth_paragraph_first_word_1954_queries", "detectable_format__title_1262_queries", "detectable_format__title_2229_queries", "detectable_format__title_295_queries", "detectable_format__title_2097_queries", "detectable_format__title_1802_queries", "detectable_format__title_1322_queries", "detectable_format__title_2969_queries", "detectable_format__title_3057_queries", "detectable_format__title_1551_queries", "detectable_format__title_2807_queries"]}' +) -MAIR_TASK_CONFIG = json.loads('{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}') +MAIR_TASK_CONFIG = json.loads( + '{"Competition-Math": "Academic", "ProofWiki_Proof": "Academic", "ProofWiki_Reference": "Academic", "Stacks_Proof": "Academic", "Stacks_Reference": "Academic", "Stein_Proof": "Academic", "Stein_Reference": "Academic", "Trench_Proof": "Academic", "Trench_Reference": "Academic", "TAD": "Academic", "TAS2": "Academic", "StackMathQA": "Academic", "APPS": "Code", "CodeEditSearch": "Code", "CodeSearchNet": "Code", "Conala": "Code", "HumanEval-X": "Code", "LeetCode": "Code", "MBPP": "Code", "RepoBench": "Code", "TLDR": "Code", "SWE-Bench-Lite": "Code", "Apple": "Finance", "ConvFinQA": "Finance", "FinQA": "Finance", "FinanceBench": "Finance", "HC3Finance": "Finance", "TAT-DQA": "Finance", "Trade-the-event": "Finance", "AY2": "Web", "ELI5": "Web", "Fever": "Web", "TREx": "Web", "WnCw": "Web", "WnWi": "Web", "WoW": "Web", "zsRE": "Web", "AILA2019-Case": "Legal", "AILA2019-Statutes": "Legal", "BSARD": "Legal", "BillSum": "Legal", "CUAD": "Legal", "GerDaLIR": "Legal", "LeCaRDv2": "Legal", "LegalQuAD": "Legal", "REGIR-EU2UK": "Legal", "REGIR-UK2EU": "Legal", "ArguAna": "Web", "CQADupStack": "Web", "FiQA": "Finance", "NFCorpus": "Medical", "Quora": "Web", "SciDocs": "Academic", "SciFact": "Academic", "TopiOCQA": "Web", "Touche": "Web", "Trec-Covid": "Medical", "ACORDAR": "Web", "CPCD": "Web", "ChroniclingAmericaQA": "Web", "Monant": "Medical", "NTCIR": "Web", "PointRec": "Web", "ProCIS-Dialog": "Web", "ProCIS-Turn": "Web", "QuanTemp": "Web", "WebTableSearch": "Web", "CARE": "Medical", "MISeD": "Web", "SParC": "Web", "SParC-SQL": "Web", "Spider": "Web", "Spider-SQL": "Web", "LitSearch": "Academic", "CAsT_2019": "Web", "CAsT_2020": "Web", "CAsT_2021": "Web", "CAsT_2022": "Web", "Core_2017": "Web", "Microblog_2011": "Web", "Microblog_2012": "Web", "Microblog_2013": "Web", "Microblog_2014": "Web", "PrecisionMedicine_2017": "Medical", "PrecisionMedicine_2018": "Medical", "PrecisionMedicine_2019": "Medical", "PrecisionMedicine-Article_2019": "Medical", "PrecisionMedicine-Article_2020": "Medical", "CliniDS_2014": "Medical", "CliniDS_2015": "Medical", "CliniDS_2016": "Medical", "ClinicalTrials_2021": "Medical", "ClinicalTrials_2022": "Medical", "ClinicalTrials_2023": "Medical", "DD_2015": "Web", "DD_2016": "Web", "DD_2017": "Web", "FairRanking_2020": "Academic", "FairRanking_2021": "Web", "FairRanking_2022": "Web", "Genomics-AdHoc_2004": "Medical", "Genomics-AdHoc_2005": "Medical", "Genomics-AdHoc_2006": "Medical", "Genomics-AdHoc_2007": "Medical", "TREC-Legal_2011": "Legal", "NeuCLIR-Tech_2023": "Web", "NeuCLIR_2022": "Web", "NeuCLIR_2023": "Web", "ProductSearch_2023": "Web", "ToT_2023": "Web", "ToT_2024": "Web", "FoodAPI": "Code", "HuggingfaceAPI": "Code", "PytorchAPI": "Code", "SpotifyAPI": "Code", "TMDB": "Code", "TensorAPI": "Code", "ToolBench": "Code", "WeatherAPI": "Code", "ExcluIR": "Web", "Core17": "Web", "News21": "Web", "Robust04": "Web", "InstructIR": "Web", "NevIR": "Web", "IFEval": "Web"}' +) _MAIR_CITATION = """@inproceedings{Sun2024MAIR, title={MAIR: A Massive Benchmark for Evaluating Instructed Retrieval}, From 12f49e6bf6b7f4b5d66031e4b5d5ea1cc85ed938 Mon Sep 17 00:00:00 2001 From: Weiwei Sun <68775773+sunnweiwei@users.noreply.github.com> Date: Sun, 10 Nov 2024 03:50:30 -0500 Subject: [PATCH 11/11] Update MAIR.py --- mteb/tasks/MAIR/eng/MAIR.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/MAIR/eng/MAIR.py b/mteb/tasks/MAIR/eng/MAIR.py index 46a937132..fa6667af9 100644 --- a/mteb/tasks/MAIR/eng/MAIR.py +++ b/mteb/tasks/MAIR/eng/MAIR.py @@ -73,7 +73,6 @@ def load_data(self, **kwargs): item["qid"]: {d["id"]: d["score"] for d in item["labels"]} for item in query_ds[split] } - self.metadata.eval_splits.append(split) self.data_loaded = True