From de8dba39d26df6db3ab7b6fc46780ddb4e183ab3 Mon Sep 17 00:00:00 2001 From: 3 a l i <58257628+alielfilali01@users.noreply.github.com> Date: Wed, 11 Dec 2024 18:57:24 +0400 Subject: [PATCH] Add new Arabic benchmarks (5) and enhance existing tasks (#372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update arabic_evals.py Add new Arabic benchmarks and update existing tasks - Renamed `arabic_mmlu` to `arabic_mmlu_mt` to highlight its machine-translated origin. - Added new benchmarks: `arabic_mmlu` ArabicMMLU (https://arxiv.org/abs/2402.12840), `arabic_mmlu_ht` (human-translated), and `MadinahQA` from MBZUAI. As well as `arabic_mmmlu` (OpenAI MMMLU), and `AraTrust` a trustworthiness benchmark for Arabic LLMs (https://arxiv.org/abs/2403.09017). - Enhanced prompt functions for better flexibility in answer options. * Update and rename OALL_tasks.txt to OALL_v1_tasks.txt Rename file to refelect that it is v1 leaderboard tasks * Create OALL_v2_tasks.txt Tasks for v2 of OALL * Update all_arabic_tasks.txt add new and renamed tasks * Update arabic_evals.py Fix formatting issues for * Update all_arabic_tasks.txt Add missing task: OpenAI's MMMLU arabic subset * Update all_arabic_tasks.txt Correct order * Update arabic_evals.py remove openai mmmlu task following the discussion here: https://github.com/huggingface/lighteval/pull/372 * Update all_arabic_tasks.txt remove openai mmmlu task following the discussion here: https://github.com/huggingface/lighteval/pull/372 * Update tasks.py Adding a templated version of arabic mmlu based on @hynky1999 request in the #372 PR * Update tasks.py remove arabic_mmlu_templated_tasks --------- Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- community_tasks/arabic_evals.py | 354 ++++++++++++++++++-- examples/tasks/OALL_tasks.txt | 136 -------- examples/tasks/OALL_v1_tasks.txt | 136 ++++++++ examples/tasks/OALL_v2_tasks.txt | 117 +++++++ examples/tasks/all_arabic_tasks.txt | 379 ++++++++++++++-------- src/lighteval/tasks/multilingual/tasks.py | 1 + 6 files changed, 817 insertions(+), 306 deletions(-) delete mode 100644 examples/tasks/OALL_tasks.txt create mode 100644 examples/tasks/OALL_v1_tasks.txt create mode 100644 examples/tasks/OALL_v2_tasks.txt diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index f575b5f07..382a780d3 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -39,9 +39,91 @@ LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"] # fmt: on -# ARABIC MMLU ## +# ArabicMMLU # fmt: off ARABIC_MMLU_SUBSETS = [ + "All", "Islamic Studies", "Islamic Studies (Middle School)", "Islamic Studies (Primary School)", "Islamic Studies (High School)", "Driving Test", + "Natural Science (Middle School)", "Natural Science (Primary School)", "History (Middle School)", "History (Primary School)", "History (High School)", "General Knowledge", + "General Knowledge (Middle School)", "General Knowledge (Primary School)", "Law (Professional)", "Physics (High School)", "Social Science (Middle School)", + "Social Science (Primary School)", "Management (University)", "Arabic Language (Middle School)", "Arabic Language (Primary School)", "Arabic Language (High School)", "Political Science (University)", + "Philosophy (High School)", "Accounting (University)", "Computer Science (Middle School)", "Computer Science (Primary School)", "Computer Science (High School)", "Computer Science (University)", + "Geography (Middle School)", "Geography (Primary School)", "Geography (High School)", "Math (Primary School)", "Biology (High School)", "Economics (Middle School)", + "Economics (High School)", "Economics (University)", "Arabic Language (General)", "Arabic Language (Grammar)", "Civics (Middle School)", "Civics (High School)" +] +# fmt: on + + +def arabic_mmlu_pfn(line, task_name: str = None): + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + + # Define the mapping from Latin to Arabic letters + latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"} + + # Create a list of valid choices with corresponding Arabic keys + choices = [] + valid_keys_latin = [] + valid_keys_arabic = [] + + # Enumerate through the options and append the valid ones + for idx, key in enumerate(["A", "B", "C", "D", "E"]): + option = line.get(f"Option {idx + 1}") + if option: # Check if option is not null + choices.append(option) + valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) + valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter + + # Find the correct index for the answer key in the Arabic version + answer_index = valid_keys_latin.index(line["Answer Key"]) + + # Construct the query with Arabic letters + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=valid_keys_arabic, # Return only valid choices (Arabic keys) + gold_index=answer_index, # Correct index in the valid Arabic keys + instruction=instruction, + target_for_fewshot_sorting=valid_keys_arabic[answer_index], # Correct answer in Arabic form + ) + + +class CustomArabicMMLUTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=arabic_mmlu_pfn, + hf_repo="MBZUAI/ArabicMMLU", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=["dev"], + few_shots_select="sequential", + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARABIC_MMLU_TASKS = [ + CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS +] + + +# ARABIC MMLU HT ## +# fmt: off +ARABIC_MMLU_HT_SUBSETS = [ "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", @@ -54,13 +136,78 @@ # fmt: on -def mmlu_arabic(line, task_name: str = None): - topic = line["subject"] - instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. \n\n" +def arabic_mmlu_ht_pfn(line, task_name: str = None): + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + choices = line["choices"] + answer_index = line["answer"] # It is an int reflecting the index of correct answer in line["choices"] + + query = f"{instruction}{line['question']}\n" + query += "".join([f"{idx}. {choice}\n" for idx, choice in enumerate(choices, start=1)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=[str(i) for i in range(1, len(choices) + 1)], # List of strings instead of ints + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=str(answer_index), # Assuming it's sorted based on the number + ) + + +class CustomArabicMMLUHTTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=arabic_mmlu_ht_pfn, + hf_repo="MBZUAI/human_translated_arabic_mmlu", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARABIC_MMLU_HT_TASKS = [ + CustomArabicMMLUHTTask(name=f"arabic_mmlu_ht:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_HT_SUBSETS +] + +# ARABIC MMLU MT ## +# fmt: off +ARABIC_MMLU_MT_SUBSETS = [ + "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", + "college_mathematics", "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics", "electrical_engineering", + "elementary_mathematics", "formal_logic", "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", + "high_school_european_history", "high_school_geography", "high_school_government_and_politics", "high_school_macroeconomics", "high_school_mathematics", + "high_school_microeconomics", "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history", "high_school_world_history", + "human_aging", "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing", "medical_genetics", + "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting", "professional_law", + "professional_medicine", "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions" +] +# fmt: on + + +def arabic_mmlu_mt_pfn(line, task_name: str = None): + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب، ج، أو د... إلخ. \n\n" choices = [line["A"], line["B"], line["C"], line["D"]] # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, # it will then be applied to arabic letters - gold_ix = LETTER_INDICES.index(line["answer"]) + answer_index = LETTER_INDICES.index( + line["answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! query = f"{instruction}{line['question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES_AR[:4], choices)]) @@ -70,12 +217,12 @@ def mmlu_arabic(line, task_name: str = None): task_name=task_name, query=query, choices=LETTER_INDICES_AR[:4], - gold_index=gold_ix, + gold_index=answer_index, instruction=instruction, ) -class CustomArabicMMLUTask(LightevalTaskConfig): +class CustomArabicMMLUMTTask(LightevalTaskConfig): def __init__( self, name, @@ -84,7 +231,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=mmlu_arabic, + prompt_function=arabic_mmlu_mt_pfn, hf_repo="OALL/Arabic_MMLU", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "dev"], @@ -101,10 +248,11 @@ def __init__( ) -ARABIC_MMLU_TASKS = [ - CustomArabicMMLUTask(name=f"arabic_mmlu:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_SUBSETS +ARABIC_MMLU_MT_TASKS = [ + CustomArabicMMLUMTTask(name=f"arabic_mmlu_mt:{subset}", hf_subset=subset) for subset in ARABIC_MMLU_MT_SUBSETS ] + # ACVA ## # fmt: off ACVA_SUBSETS = [ @@ -120,7 +268,7 @@ def __init__( # fmt: on -def acva(line, task_name: str = None): +def acva_pfn(line, task_name: str = None): question = line["question"] answer = line["answer"] @@ -141,7 +289,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=acva, + prompt_function=acva_pfn, hf_repo="OALL/ACVA", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], @@ -161,7 +309,69 @@ def __init__( ACVA_TASKS = [CustomACVATask(name=f"acva:{subset}", hf_subset=subset) for subset in ACVA_SUBSETS] -def arabic_exams(line, task_name: str = None): +# AraTrust ## +# fmt: off +ARATRUST_SUBSETS = [ + "Trustfulness", "MentalHealth", "PhysicalHealth", "Offensive", "Ethics", "Privacy", "Unfairness", "Illegal", +] +# fmt: on + + +def aratrust_pfn(line, task_name: str = None): + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج. \n\n" + choices = [line["A"], line["B"], line["C"]] + # Answers are provided with roman letters - we look for the correct index in LETTER_INDICES, + # it will then be applied to arabic letters + answer_index = LETTER_INDICES_AR.index( + line["Answer"] + ) # line["answer"] is the correct answer. That's why we need to index it ! + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{choice}\n" for choice in choices]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=LETTER_INDICES_AR[:3], + gold_index=answer_index, + instruction=instruction, + target_for_fewshot_sorting=LETTER_INDICES_AR[answer_index], + ) + + +class CustomAraTrustTask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=aratrust_pfn, + hf_repo="asas-ai/AraTrust-categorized", + metric=[ + Metrics.f1_score + ], # Following the paper (AraTrust: An Evaluation of Trustworthiness for LLMs in Arabic)[https://arxiv.org/abs/2403.09017] + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + suite=["community"], + generation_size=-1, + stop_sequence=[], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +ARATRUST_TASKS = [CustomAraTrustTask(name=f"aratrust:{subset}", hf_subset=subset) for subset in ARATRUST_SUBSETS] + + +def arabic_exams_pfn(line, task_name: str = None): topic = line["subject"] question = line["question"] choices = [line["A"], line["B"], line["C"], line["D"]] @@ -186,7 +396,7 @@ def arabic_exams(line, task_name: str = None): # ARABIC EXAMS ## arabic_exams_task = LightevalTaskConfig( name="arabic_exams", - prompt_function=arabic_exams, + prompt_function=arabic_exams_pfn, suite=["community"], hf_repo="OALL/Arabic_EXAMS", hf_subset="default", @@ -210,7 +420,7 @@ def arabic_exams(line, task_name: str = None): # fmt: on -def alghafa_prompt(line, task_name: str = None): +def alghafa_pfn(line, task_name: str = None): question = line["query"] answer_index = int(line["label"]) # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' @@ -241,7 +451,7 @@ def __init__( super().__init__( name=name, hf_subset=hf_subset, - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", metric=[Metrics.loglikelihood_acc_norm], hf_avail_splits=["test", "validation"], @@ -253,6 +463,7 @@ def __init__( stop_sequence=None, output_regex=None, frozen=False, + trust_dataset=True, version=0, ) @@ -263,7 +474,7 @@ def __init__( # race_ar race_ar_task = LightevalTaskConfig( name="race_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="race_ar", @@ -280,7 +491,7 @@ def __init__( # piqa_ar piqa_ar_task = LightevalTaskConfig( name="piqa_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="piqa_ar", @@ -297,7 +508,7 @@ def __init__( # arc_easy_ar arc_easy_ar_task = LightevalTaskConfig( name="arc_easy_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_easy_ar", @@ -314,7 +525,7 @@ def __init__( # arc_challenge_okapi_ar arc_challenge_okapi_ar_task = LightevalTaskConfig( name="arc_challenge_okapi_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="arc_challenge_okapi_ar", @@ -331,7 +542,7 @@ def __init__( # mmlu_okapi_ar mmlu_okapi_ar_task = LightevalTaskConfig( name="mmlu_okapi_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="mmlu_okapi_ar", @@ -348,7 +559,7 @@ def __init__( # openbook_qa_ext_ar openbook_qa_ext_ar_task = LightevalTaskConfig( name="openbook_qa_ext_ar", - prompt_function=alghafa_prompt, + prompt_function=alghafa_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="openbook_qa_ext_ar", @@ -363,9 +574,7 @@ def __init__( # boolq_ar - - -def boolq_prompt_arabic(line, task_name: str = None): +def boolq_arabic_pfn(line, task_name: str = None): question = line["question"] passage = line["passage"] instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا" @@ -388,7 +597,7 @@ def boolq_prompt_arabic(line, task_name: str = None): boolq_ar_task = LightevalTaskConfig( name="boolq_ar", - prompt_function=boolq_prompt_arabic, + prompt_function=boolq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="boolq_ar", @@ -403,7 +612,7 @@ def boolq_prompt_arabic(line, task_name: str = None): # copa_ext_ar -def copa_prompt_arabic(line, task_name: str = None): +def copa_arabic_pfn(line, task_name: str = None): premise = line["premise"] choices = [line["choice1"], line["choice2"]] question_map = {"cause": "لأن", "effect": "لذلك"} @@ -423,7 +632,7 @@ def copa_prompt_arabic(line, task_name: str = None): copa_ext_ar_task = LightevalTaskConfig( name="copa_ext_ar", - prompt_function=copa_prompt_arabic, + prompt_function=copa_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="copa_ext_ar", @@ -438,7 +647,7 @@ def copa_prompt_arabic(line, task_name: str = None): # hellaswag_okapi_ar -def hellaswag_prompt_arabic(line, task_name: str = None): +def hellaswag_arabic_pfn(line, task_name: str = None): ctx = re.sub(r"\[.*?\]", "", line["ctx"]) # Remove latin words within brackets endings = [ re.sub(r"\[.*?\]", "", e) for e in eval(line["endings"]) @@ -467,7 +676,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): hellaswag_okapi_ar_task = LightevalTaskConfig( name="hellaswag_okapi_ar", - prompt_function=hellaswag_prompt_arabic, + prompt_function=hellaswag_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="hellaswag_okapi_ar", @@ -482,7 +691,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None): # toxigen_ar -def toxigen_prompt_arabic(line, task_name: str = None): +def toxigen_arabic_pfn(line, task_name: str = None): text = line["text"] label = 1 if ((line["toxicity_ai"] + line["toxicity_human"]) > 5.5) else 0 instruction = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".' @@ -504,7 +713,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): toxigen_ar_task = LightevalTaskConfig( name="toxigen_ar", - prompt_function=toxigen_prompt_arabic, + prompt_function=toxigen_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="toxigen_ar", @@ -519,7 +728,7 @@ def toxigen_prompt_arabic(line, task_name: str = None): # sciq_ar -def sciq_prompt_arabic(line, task_name: str = None): +def sciq_arabic_pfn(line, task_name: str = None): support = line["support"] question = line["question"] correct_answer = line["correct_answer"] @@ -555,7 +764,7 @@ def sciq_prompt_arabic(line, task_name: str = None): sciq_ar_task = LightevalTaskConfig( name="sciq_ar", - prompt_function=sciq_prompt_arabic, + prompt_function=sciq_arabic_pfn, suite=["community"], hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", hf_subset="sciq_ar", @@ -569,10 +778,87 @@ def sciq_prompt_arabic(line, task_name: str = None): ) +# madinah_qa +# fmt: off +MADINAH_QA_SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"] +# fmt: on + + +def madinah_qa_pfn(line, task_name: str = None): + instruction = "السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة:\n\n" + + # Define the mapping from Latin to Arabic letters + latin_to_arabic = {"A": "أ", "B": "ب", "C": "ج", "D": "د", "E": "هـ"} + + # Create a list of valid choices with corresponding Arabic keys + choices = [] + valid_keys_latin = [] + valid_keys_arabic = [] + + # Enumerate through the options and append the valid ones + for idx, key in enumerate(["A", "B", "C", "D", "E"]): + option = line.get(f"Option {idx + 1}") + if option: # Check if option is not null + choices.append(option) + valid_keys_latin.append(key) # Append the Latin key (A, B, C, D, E) + valid_keys_arabic.append(latin_to_arabic[key]) # Append the corresponding Arabic letter + + # Find the correct index for the answer key in the Arabic version + answer_index = valid_keys_latin.index(line["Answer Key"]) + + query = f"{instruction}{line['Question']}\n" + query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys_arabic, choices)]) + query += "الإجابة:" + + return Doc( + task_name=task_name, + query=query, + choices=choices, + gold_index=answer_index, # Correct index in the valid keys + instruction=instruction, + target_for_fewshot_sorting=valid_keys_latin[answer_index], # Correct answer in Latin form + ) + + +class CustomMadinahQATask(LightevalTaskConfig): + def __init__( + self, + name, + hf_subset, + ): + super().__init__( + name=name, + hf_subset=hf_subset, + prompt_function=madinah_qa_pfn, + hf_repo="MBZUAI/MadinahQA", + metric=[Metrics.loglikelihood_acc_norm], + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=["dev"], + few_shots_select="sequential", + suite=["community"], + generation_size=-1, + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, + ) + + +MADINAH_QA_TASKS = [ + CustomMadinahQATask(name=f"madinah_qa:{subset}", hf_subset=subset) for subset in MADINAH_QA_SUBSETS +] + + TASKS_TABLE = ( ARABIC_MMLU_TASKS + + ARABIC_MMLU_HT_TASKS + + ARABIC_MMLU_MT_TASKS + ACVA_TASKS + ALGHAFA_TASKS + + ARATRUST_TASKS + + MADINAH_QA_TASKS + [arabic_exams_task] + [race_ar_task] + [piqa_ar_task] diff --git a/examples/tasks/OALL_tasks.txt b/examples/tasks/OALL_tasks.txt deleted file mode 100644 index 346d062c6..000000000 --- a/examples/tasks/OALL_tasks.txt +++ /dev/null @@ -1,136 +0,0 @@ -lighteval|xstory_cloze:ar|0|0 -community|arabic_mmlu:abstract_algebra|5|1 -community|arabic_mmlu:anatomy|5|1 -community|arabic_mmlu:astronomy|5|1 -community|arabic_mmlu:business_ethics|5|1 -community|arabic_mmlu:clinical_knowledge|5|1 -community|arabic_mmlu:college_biology|5|1 -community|arabic_mmlu:college_chemistry|5|1 -community|arabic_mmlu:college_computer_science|5|1 -community|arabic_mmlu:college_mathematics|5|1 -community|arabic_mmlu:college_medicine|5|1 -community|arabic_mmlu:college_physics|5|1 -community|arabic_mmlu:computer_security|5|1 -community|arabic_mmlu:conceptual_physics|5|1 -community|arabic_mmlu:econometrics|5|1 -community|arabic_mmlu:electrical_engineering|5|1 -community|arabic_mmlu:elementary_mathematics|5|1 -community|arabic_mmlu:formal_logic|5|1 -community|arabic_mmlu:global_facts|5|1 -community|arabic_mmlu:high_school_biology|5|1 -community|arabic_mmlu:high_school_chemistry|5|1 -community|arabic_mmlu:high_school_computer_science|5|1 -community|arabic_mmlu:high_school_european_history|5|1 -community|arabic_mmlu:high_school_geography|5|1 -community|arabic_mmlu:high_school_government_and_politics|5|1 -community|arabic_mmlu:high_school_macroeconomics|5|1 -community|arabic_mmlu:high_school_mathematics|5|1 -community|arabic_mmlu:high_school_microeconomics|5|1 -community|arabic_mmlu:high_school_physics|5|1 -community|arabic_mmlu:high_school_psychology|5|1 -community|arabic_mmlu:high_school_statistics|5|1 -community|arabic_mmlu:high_school_us_history|5|1 -community|arabic_mmlu:high_school_world_history|5|1 -community|arabic_mmlu:human_aging|5|1 -community|arabic_mmlu:human_sexuality|5|1 -community|arabic_mmlu:international_law|5|1 -community|arabic_mmlu:jurisprudence|5|1 -community|arabic_mmlu:logical_fallacies|5|1 -community|arabic_mmlu:machine_learning|5|1 -community|arabic_mmlu:management|5|1 -community|arabic_mmlu:marketing|5|1 -community|arabic_mmlu:medical_genetics|5|1 -community|arabic_mmlu:miscellaneous|5|1 -community|arabic_mmlu:moral_disputes|5|1 -community|arabic_mmlu:moral_scenarios|5|1 -community|arabic_mmlu:nutrition|5|1 -community|arabic_mmlu:philosophy|5|1 -community|arabic_mmlu:prehistory|5|1 -community|arabic_mmlu:professional_accounting|5|1 -community|arabic_mmlu:professional_law|5|1 -community|arabic_mmlu:professional_medicine|5|1 -community|arabic_mmlu:professional_psychology|5|1 -community|arabic_mmlu:public_relations|5|1 -community|arabic_mmlu:security_studies|5|1 -community|arabic_mmlu:sociology|5|1 -community|arabic_mmlu:us_foreign_policy|5|1 -community|arabic_mmlu:virology|5|1 -community|arabic_mmlu:world_religions|5|1 -community|arabic_exams|5|1 -community|acva:Algeria|5|1 -community|acva:Ancient_Egypt|5|1 -community|acva:Arab_Empire|5|1 -community|acva:Arabic_Architecture|5|1 -community|acva:Arabic_Art|5|1 -community|acva:Arabic_Astronomy|5|1 -community|acva:Arabic_Calligraphy|5|1 -community|acva:Arabic_Ceremony|5|1 -community|acva:Arabic_Clothing|5|1 -community|acva:Arabic_Culture|5|1 -community|acva:Arabic_Food|5|1 -community|acva:Arabic_Funeral|5|1 -community|acva:Arabic_Geography|5|1 -community|acva:Arabic_History|5|1 -community|acva:Arabic_Language_Origin|5|1 -community|acva:Arabic_Literature|5|1 -community|acva:Arabic_Math|5|1 -community|acva:Arabic_Medicine|5|1 -community|acva:Arabic_Music|5|1 -community|acva:Arabic_Ornament|5|1 -community|acva:Arabic_Philosophy|5|1 -community|acva:Arabic_Physics_and_Chemistry|5|1 -community|acva:Arabic_Wedding|5|1 -community|acva:Bahrain|5|1 -community|acva:Comoros|5|1 -community|acva:Egypt_modern|5|1 -community|acva:InfluenceFromAncientEgypt|5|1 -community|acva:InfluenceFromByzantium|5|1 -community|acva:InfluenceFromChina|5|1 -community|acva:InfluenceFromGreece|5|1 -community|acva:InfluenceFromIslam|5|1 -community|acva:InfluenceFromPersia|5|1 -community|acva:InfluenceFromRome|5|1 -community|acva:Iraq|5|1 -community|acva:Islam_Education|5|1 -community|acva:Islam_branches_and_schools|5|1 -community|acva:Islamic_law_system|5|1 -community|acva:Jordan|5|1 -community|acva:Kuwait|5|1 -community|acva:Lebanon|5|1 -community|acva:Libya|5|1 -community|acva:Mauritania|5|1 -community|acva:Mesopotamia_civilization|5|1 -community|acva:Morocco|5|1 -community|acva:Oman|5|1 -community|acva:Palestine|5|1 -community|acva:Qatar|5|1 -community|acva:Saudi_Arabia|5|1 -community|acva:Somalia|5|1 -community|acva:Sudan|5|1 -community|acva:Syria|5|1 -community|acva:Tunisia|5|1 -community|acva:United_Arab_Emirates|5|1 -community|acva:Yemen|5|1 -community|acva:communication|5|1 -community|acva:computer_and_phone|5|1 -community|acva:daily_life|5|1 -community|acva:entertainment|5|1 -community|alghafa:mcq_exams_test_ar|5|1 -community|alghafa:meta_ar_dialects|5|1 -community|alghafa:meta_ar_msa|5|1 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_task|5|1 -community|alghafa:multiple_choice_sentiment_task|5|1 -community|race_ar|5|1 -community|piqa_ar|5|1 -community|arc_easy_ar|5|1 -community|arc_challenge_okapi_ar|5|1 -community|openbook_qa_ext_ar|5|1 -community|boolq_ar|5|1 -community|copa_ext_ar|5|1 -community|hellaswag_okapi_ar|5|1 -community|toxigen_ar|5|1 -community|sciq_ar|5|1 diff --git a/examples/tasks/OALL_v1_tasks.txt b/examples/tasks/OALL_v1_tasks.txt new file mode 100644 index 000000000..08e9a51cd --- /dev/null +++ b/examples/tasks/OALL_v1_tasks.txt @@ -0,0 +1,136 @@ +lighteval|xstory_cloze:ar|0|0 +community|arabic_mmlu_mt:abstract_algebra|0|0 +community|arabic_mmlu_mt:anatomy|0|0 +community|arabic_mmlu_mt:astronomy|0|0 +community|arabic_mmlu_mt:business_ethics|0|0 +community|arabic_mmlu_mt:clinical_knowledge|0|0 +community|arabic_mmlu_mt:college_biology|0|0 +community|arabic_mmlu_mt:college_chemistry|0|0 +community|arabic_mmlu_mt:college_computer_science|0|0 +community|arabic_mmlu_mt:college_mathematics|0|0 +community|arabic_mmlu_mt:college_medicine|0|0 +community|arabic_mmlu_mt:college_physics|0|0 +community|arabic_mmlu_mt:computer_security|0|0 +community|arabic_mmlu_mt:conceptual_physics|0|0 +community|arabic_mmlu_mt:econometrics|0|0 +community|arabic_mmlu_mt:electrical_engineering|0|0 +community|arabic_mmlu_mt:elementary_mathematics|0|0 +community|arabic_mmlu_mt:formal_logic|0|0 +community|arabic_mmlu_mt:global_facts|0|0 +community|arabic_mmlu_mt:high_school_biology|0|0 +community|arabic_mmlu_mt:high_school_chemistry|0|0 +community|arabic_mmlu_mt:high_school_computer_science|0|0 +community|arabic_mmlu_mt:high_school_european_history|0|0 +community|arabic_mmlu_mt:high_school_geography|0|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0|0 +community|arabic_mmlu_mt:high_school_mathematics|0|0 +community|arabic_mmlu_mt:high_school_microeconomics|0|0 +community|arabic_mmlu_mt:high_school_physics|0|0 +community|arabic_mmlu_mt:high_school_psychology|0|0 +community|arabic_mmlu_mt:high_school_statistics|0|0 +community|arabic_mmlu_mt:high_school_us_history|0|0 +community|arabic_mmlu_mt:high_school_world_history|0|0 +community|arabic_mmlu_mt:human_aging|0|0 +community|arabic_mmlu_mt:human_sexuality|0|0 +community|arabic_mmlu_mt:international_law|0|0 +community|arabic_mmlu_mt:jurisprudence|0|0 +community|arabic_mmlu_mt:logical_fallacies|0|0 +community|arabic_mmlu_mt:machine_learning|0|0 +community|arabic_mmlu_mt:management|0|0 +community|arabic_mmlu_mt:marketing|0|0 +community|arabic_mmlu_mt:medical_genetics|0|0 +community|arabic_mmlu_mt:miscellaneous|0|0 +community|arabic_mmlu_mt:moral_disputes|0|0 +community|arabic_mmlu_mt:moral_scenarios|0|0 +community|arabic_mmlu_mt:nutrition|0|0 +community|arabic_mmlu_mt:philosophy|0|0 +community|arabic_mmlu_mt:prehistory|0|0 +community|arabic_mmlu_mt:professional_accounting|0|0 +community|arabic_mmlu_mt:professional_law|0|0 +community|arabic_mmlu_mt:professional_medicine|0|0 +community|arabic_mmlu_mt:professional_psychology|0|0 +community|arabic_mmlu_mt:public_relations|0|0 +community|arabic_mmlu_mt:security_studies|0|0 +community|arabic_mmlu_mt:sociology|0|0 +community|arabic_mmlu_mt:us_foreign_policy|0|0 +community|arabic_mmlu_mt:virology|0|0 +community|arabic_mmlu_mt:world_religions|0|0 +community|arabic_exams|0|0 +community|acva:Algeria|0|0 +community|acva:Ancient_Egypt|0|0 +community|acva:Arab_Empire|0|0 +community|acva:Arabic_Architecture|0|0 +community|acva:Arabic_Art|0|0 +community|acva:Arabic_Astronomy|0|0 +community|acva:Arabic_Calligraphy|0|0 +community|acva:Arabic_Ceremony|0|0 +community|acva:Arabic_Clothing|0|0 +community|acva:Arabic_Culture|0|0 +community|acva:Arabic_Food|0|0 +community|acva:Arabic_Funeral|0|0 +community|acva:Arabic_Geography|0|0 +community|acva:Arabic_History|0|0 +community|acva:Arabic_Language_Origin|0|0 +community|acva:Arabic_Literature|0|0 +community|acva:Arabic_Math|0|0 +community|acva:Arabic_Medicine|0|0 +community|acva:Arabic_Music|0|0 +community|acva:Arabic_Ornament|0|0 +community|acva:Arabic_Philosophy|0|0 +community|acva:Arabic_Physics_and_Chemistry|0|0 +community|acva:Arabic_Wedding|0|0 +community|acva:Bahrain|0|0 +community|acva:Comoros|0|0 +community|acva:Egypt_modern|0|0 +community|acva:InfluenceFromAncientEgypt|0|0 +community|acva:InfluenceFromByzantium|0|0 +community|acva:InfluenceFromChina|0|0 +community|acva:InfluenceFromGreece|0|0 +community|acva:InfluenceFromIslam|0|0 +community|acva:InfluenceFromPersia|0|0 +community|acva:InfluenceFromRome|0|0 +community|acva:Iraq|0|0 +community|acva:Islam_Education|0|0 +community|acva:Islam_branches_and_schools|0|0 +community|acva:Islamic_law_system|0|0 +community|acva:Jordan|0|0 +community|acva:Kuwait|0|0 +community|acva:Lebanon|0|0 +community|acva:Libya|0|0 +community|acva:Mauritania|0|0 +community|acva:Mesopotamia_civilization|0|0 +community|acva:Morocco|0|0 +community|acva:Oman|0|0 +community|acva:Palestine|0|0 +community|acva:Qatar|0|0 +community|acva:Saudi_Arabia|0|0 +community|acva:Somalia|0|0 +community|acva:Sudan|0|0 +community|acva:Syria|0|0 +community|acva:Tunisia|0|0 +community|acva:United_Arab_Emirates|0|0 +community|acva:Yemen|0|0 +community|acva:communication|0|0 +community|acva:computer_and_phone|0|0 +community|acva:daily_life|0|0 +community|acva:entertainment|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|race_ar|0|0 +community|piqa_ar|0|0 +community|arc_easy_ar|0|0 +community|arc_challenge_okapi_ar|0|0 +community|openbook_qa_ext_ar|0|0 +community|boolq_ar|0|0 +community|copa_ext_ar|0|0 +community|hellaswag_okapi_ar|0|0 +community|toxigen_ar|0|0 +community|sciq_ar|0|0 diff --git a/examples/tasks/OALL_v2_tasks.txt b/examples/tasks/OALL_v2_tasks.txt new file mode 100644 index 000000000..fc1b4f7e9 --- /dev/null +++ b/examples/tasks/OALL_v2_tasks.txt @@ -0,0 +1,117 @@ +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|arabic_exams|0|0 +community|arabic_mmlu:Islamic Studies|0|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0|0 +community|arabic_mmlu:Islamic Studies (High School)|0|0 +community|arabic_mmlu:Driving Test|0|0 +community|arabic_mmlu:Natural Science (Middle School)|0|0 +community|arabic_mmlu:Natural Science (Primary School)|0|0 +community|arabic_mmlu:History (Middle School)|0|0 +community|arabic_mmlu:History (Primary School)|0|0 +community|arabic_mmlu:History (High School)|0|0 +community|arabic_mmlu:General Knowledge|0|0 +community|arabic_mmlu:General Knowledge (Middle School)|0|0 +community|arabic_mmlu:General Knowledge (Primary School)|0|0 +community|arabic_mmlu:Law (Professional)|0|0 +community|arabic_mmlu:Physics (High School)|0|0 +community|arabic_mmlu:Social Science (Middle School)|0|0 +community|arabic_mmlu:Social Science (Primary School)|0|0 +community|arabic_mmlu:Management (University)|0|0 +community|arabic_mmlu:Arabic Language (Middle School)|0|0 +community|arabic_mmlu:Arabic Language (Primary School)|0|0 +community|arabic_mmlu:Arabic Language (High School)|0|0 +community|arabic_mmlu:Political Science (University)|0|0 +community|arabic_mmlu:Philosophy (High School)|0|0 +community|arabic_mmlu:Accounting (University)|0|0 +community|arabic_mmlu:Computer Science (Middle School)|0|0 +community|arabic_mmlu:Computer Science (Primary School)|0|0 +community|arabic_mmlu:Computer Science (High School)|0|0 +community|arabic_mmlu:Computer Science (University)|0|0 +community|arabic_mmlu:Geography (Middle School)|0|0 +community|arabic_mmlu:Geography (Primary School)|0|0 +community|arabic_mmlu:Geography (High School)|0|0 +community|arabic_mmlu:Math (Primary School)|0|0 +community|arabic_mmlu:Biology (High School)|0|0 +community|arabic_mmlu:Economics (Middle School)|0|0 +community|arabic_mmlu:Economics (High School)|0|0 +community|arabic_mmlu:Economics (University)|0|0 +community|arabic_mmlu:Arabic Language (General)|0|0 +community|arabic_mmlu:Arabic Language (Grammar)|0|0 +community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmlu:Civics (High School)|0|0 +community|madinah_qa:Arabic Language (General)|0|0 +community|madinah_qa:Arabic Language (Grammar)|0|0 +community|aratrust:Trustfulness|0|0 +community|aratrust:MentalHealth|0|0 +community|aratrust:PhysicalHealth|0|0 +community|aratrust:Offensive|0|0 +community|aratrust:Ethics|0|0 +community|aratrust:Privacy|0|0 +community|aratrust:Unfairness|0|0 +community|aratrust:Illegal|0|0 +community|arabic_mmlu_ht:abstract_algebra|0|0 +community|arabic_mmlu_ht:anatomy|0|0 +community|arabic_mmlu_ht:astronomy|0|0 +community|arabic_mmlu_ht:business_ethics|0|0 +community|arabic_mmlu_ht:clinical_knowledge|0|0 +community|arabic_mmlu_ht:college_biology|0|0 +community|arabic_mmlu_ht:college_chemistry|0|0 +community|arabic_mmlu_ht:college_computer_science|0|0 +community|arabic_mmlu_ht:college_mathematics|0|0 +community|arabic_mmlu_ht:college_medicine|0|0 +community|arabic_mmlu_ht:college_physics|0|0 +community|arabic_mmlu_ht:computer_security|0|0 +community|arabic_mmlu_ht:conceptual_physics|0|0 +community|arabic_mmlu_ht:econometrics|0|0 +community|arabic_mmlu_ht:electrical_engineering|0|0 +community|arabic_mmlu_ht:elementary_mathematics|0|0 +community|arabic_mmlu_ht:formal_logic|0|0 +community|arabic_mmlu_ht:global_facts|0|0 +community|arabic_mmlu_ht:high_school_biology|0|0 +community|arabic_mmlu_ht:high_school_chemistry|0|0 +community|arabic_mmlu_ht:high_school_computer_science|0|0 +community|arabic_mmlu_ht:high_school_european_history|0|0 +community|arabic_mmlu_ht:high_school_geography|0|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0|0 +community|arabic_mmlu_ht:high_school_mathematics|0|0 +community|arabic_mmlu_ht:high_school_microeconomics|0|0 +community|arabic_mmlu_ht:high_school_physics|0|0 +community|arabic_mmlu_ht:high_school_psychology|0|0 +community|arabic_mmlu_ht:high_school_statistics|0|0 +community|arabic_mmlu_ht:high_school_us_history|0|0 +community|arabic_mmlu_ht:high_school_world_history|0|0 +community|arabic_mmlu_ht:human_aging|0|0 +community|arabic_mmlu_ht:human_sexuality|0|0 +community|arabic_mmlu_ht:international_law|0|0 +community|arabic_mmlu_ht:jurisprudence|0|0 +community|arabic_mmlu_ht:logical_fallacies|0|0 +community|arabic_mmlu_ht:machine_learning|0|0 +community|arabic_mmlu_ht:management|0|0 +community|arabic_mmlu_ht:marketing|0|0 +community|arabic_mmlu_ht:medical_genetics|0|0 +community|arabic_mmlu_ht:miscellaneous|0|0 +community|arabic_mmlu_ht:moral_disputes|0|0 +community|arabic_mmlu_ht:moral_scenarios|0|0 +community|arabic_mmlu_ht:nutrition|0|0 +community|arabic_mmlu_ht:philosophy|0|0 +community|arabic_mmlu_ht:prehistory|0|0 +community|arabic_mmlu_ht:professional_accounting|0|0 +community|arabic_mmlu_ht:professional_law|0|0 +community|arabic_mmlu_ht:professional_medicine|0|0 +community|arabic_mmlu_ht:professional_psychology|0|0 +community|arabic_mmlu_ht:public_relations|0|0 +community|arabic_mmlu_ht:security_studies|0|0 +community|arabic_mmlu_ht:sociology|0|0 +community|arabic_mmlu_ht:us_foreign_policy|0|0 +community|arabic_mmlu_ht:virology|0|0 +community|arabic_mmlu_ht:world_religions|0|0 diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index fa430ed14..8593fa2f8 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -1,137 +1,244 @@ lighteval|xstory_cloze:ar|0|0 -community|arabic_mmlu:abstract_algebra|5|1 -community|arabic_mmlu:anatomy|5|1 -community|arabic_mmlu:astronomy|5|1 -community|arabic_mmlu:business_ethics|5|1 -community|arabic_mmlu:clinical_knowledge|5|1 -community|arabic_mmlu:college_biology|5|1 -community|arabic_mmlu:college_chemistry|5|1 -community|arabic_mmlu:college_computer_science|5|1 -community|arabic_mmlu:college_mathematics|5|1 -community|arabic_mmlu:college_medicine|5|1 -community|arabic_mmlu:college_physics|5|1 -community|arabic_mmlu:computer_security|5|1 -community|arabic_mmlu:conceptual_physics|5|1 -community|arabic_mmlu:econometrics|5|1 -community|arabic_mmlu:electrical_engineering|5|1 -community|arabic_mmlu:elementary_mathematics|5|1 -community|arabic_mmlu:formal_logic|5|1 -community|arabic_mmlu:global_facts|5|1 -community|arabic_mmlu:high_school_biology|5|1 -community|arabic_mmlu:high_school_chemistry|5|1 -community|arabic_mmlu:high_school_computer_science|5|1 -community|arabic_mmlu:high_school_european_history|5|1 -community|arabic_mmlu:high_school_geography|5|1 -community|arabic_mmlu:high_school_government_and_politics|5|1 -community|arabic_mmlu:high_school_macroeconomics|5|1 -community|arabic_mmlu:high_school_mathematics|5|1 -community|arabic_mmlu:high_school_microeconomics|5|1 -community|arabic_mmlu:high_school_physics|5|1 -community|arabic_mmlu:high_school_psychology|5|1 -community|arabic_mmlu:high_school_statistics|5|1 -community|arabic_mmlu:high_school_us_history|5|1 -community|arabic_mmlu:high_school_world_history|5|1 -community|arabic_mmlu:human_aging|5|1 -community|arabic_mmlu:human_sexuality|5|1 -community|arabic_mmlu:international_law|5|1 -community|arabic_mmlu:jurisprudence|5|1 -community|arabic_mmlu:logical_fallacies|5|1 -community|arabic_mmlu:machine_learning|5|1 -community|arabic_mmlu:management|5|1 -community|arabic_mmlu:marketing|5|1 -community|arabic_mmlu:medical_genetics|5|1 -community|arabic_mmlu:miscellaneous|5|1 -community|arabic_mmlu:moral_disputes|5|1 -community|arabic_mmlu:moral_scenarios|5|1 -community|arabic_mmlu:nutrition|5|1 -community|arabic_mmlu:philosophy|5|1 -community|arabic_mmlu:prehistory|5|1 -community|arabic_mmlu:professional_accounting|5|1 -community|arabic_mmlu:professional_law|5|1 -community|arabic_mmlu:professional_medicine|5|1 -community|arabic_mmlu:professional_psychology|5|1 -community|arabic_mmlu:public_relations|5|1 -community|arabic_mmlu:security_studies|5|1 -community|arabic_mmlu:sociology|5|1 -community|arabic_mmlu:us_foreign_policy|5|1 -community|arabic_mmlu:virology|5|1 -community|arabic_mmlu:world_religions|5|1 -community|arabic_exams|5|1 -community|acva:Algeria|5|1 -community|acva:Ancient_Egypt|5|1 -community|acva:Arab_Empire|5|1 -community|acva:Arabic_Architecture|5|1 -community|acva:Arabic_Art|5|1 -community|acva:Arabic_Astronomy|5|1 -community|acva:Arabic_Calligraphy|5|1 -community|acva:Arabic_Ceremony|5|1 -community|acva:Arabic_Clothing|5|1 -community|acva:Arabic_Culture|5|1 -community|acva:Arabic_Food|5|1 -community|acva:Arabic_Funeral|5|1 -community|acva:Arabic_Geography|5|1 -community|acva:Arabic_History|5|1 -community|acva:Arabic_Language_Origin|5|1 -community|acva:Arabic_Literature|5|1 -community|acva:Arabic_Math|5|1 -community|acva:Arabic_Medicine|5|1 -community|acva:Arabic_Music|5|1 -community|acva:Arabic_Ornament|5|1 -community|acva:Arabic_Philosophy|5|1 -community|acva:Arabic_Physics_and_Chemistry|5|1 -community|acva:Arabic_Wedding|5|1 -community|acva:Bahrain|5|1 -community|acva:Comoros|5|1 -community|acva:Egypt_modern|5|1 -community|acva:InfluenceFromAncientEgypt|5|1 -community|acva:InfluenceFromByzantium|5|1 -community|acva:InfluenceFromChina|5|1 -community|acva:InfluenceFromGreece|5|1 -community|acva:InfluenceFromIslam|5|1 -community|acva:InfluenceFromPersia|5|1 -community|acva:InfluenceFromRome|5|1 -community|acva:Iraq|5|1 -community|acva:Islam_Education|5|1 -community|acva:Islam_branches_and_schools|5|1 -community|acva:Islamic_law_system|5|1 -community|acva:Jordan|5|1 -community|acva:Kuwait|5|1 -community|acva:Lebanon|5|1 -community|acva:Libya|5|1 -community|acva:Mauritania|5|1 -community|acva:Mesopotamia_civilization|5|1 -community|acva:Morocco|5|1 -community|acva:Oman|5|1 -community|acva:Palestine|5|1 -community|acva:Qatar|5|1 -community|acva:Saudi_Arabia|5|1 -community|acva:Somalia|5|1 -community|acva:Sudan|5|1 -community|acva:Syria|5|1 -community|acva:Tunisia|5|1 -community|acva:United_Arab_Emirates|5|1 -community|acva:Yemen|5|1 -community|acva:communication|5|1 -community|acva:computer_and_phone|5|1 -community|acva:daily_life|5|1 -community|acva:entertainment|5|1 -community|alghafa:mcq_exams_test_ar|5|1 -community|alghafa:meta_ar_dialects|5|1 -community|alghafa:meta_ar_msa|5|1 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|5|1 -community|alghafa:multiple_choice_grounded_statement_soqal_task|5|1 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|5|1 -community|alghafa:multiple_choice_rating_sentiment_task|5|1 -community|alghafa:multiple_choice_sentiment_task|5|1 -community|race_ar|5|1 -community|piqa_ar|5|1 -community|arc_easy_ar|5|1 -community|arc_challenge_okapi_ar|5|1 -community|mmlu_okapi_ar|5|1 -community|openbook_qa_ext_ar|5|1 -community|boolq_ar|5|1 -community|copa_ext_ar|5|1 -community|hellaswag_okapi_ar|5|1 -community|toxigen_ar|5|1 -community|sciq_ar|5|1 +community|arabic_exams|0|0 +community|arabic_mmlu_mt:abstract_algebra|0|0 +community|arabic_mmlu_mt:anatomy|0|0 +community|arabic_mmlu_mt:astronomy|0|0 +community|arabic_mmlu_mt:business_ethics|0|0 +community|arabic_mmlu_mt:clinical_knowledge|0|0 +community|arabic_mmlu_mt:college_biology|0|0 +community|arabic_mmlu_mt:college_chemistry|0|0 +community|arabic_mmlu_mt:college_computer_science|0|0 +community|arabic_mmlu_mt:college_mathematics|0|0 +community|arabic_mmlu_mt:college_medicine|0|0 +community|arabic_mmlu_mt:college_physics|0|0 +community|arabic_mmlu_mt:computer_security|0|0 +community|arabic_mmlu_mt:conceptual_physics|0|0 +community|arabic_mmlu_mt:econometrics|0|0 +community|arabic_mmlu_mt:electrical_engineering|0|0 +community|arabic_mmlu_mt:elementary_mathematics|0|0 +community|arabic_mmlu_mt:formal_logic|0|0 +community|arabic_mmlu_mt:global_facts|0|0 +community|arabic_mmlu_mt:high_school_biology|0|0 +community|arabic_mmlu_mt:high_school_chemistry|0|0 +community|arabic_mmlu_mt:high_school_computer_science|0|0 +community|arabic_mmlu_mt:high_school_european_history|0|0 +community|arabic_mmlu_mt:high_school_geography|0|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0|0 +community|arabic_mmlu_mt:high_school_mathematics|0|0 +community|arabic_mmlu_mt:high_school_microeconomics|0|0 +community|arabic_mmlu_mt:high_school_physics|0|0 +community|arabic_mmlu_mt:high_school_psychology|0|0 +community|arabic_mmlu_mt:high_school_statistics|0|0 +community|arabic_mmlu_mt:high_school_us_history|0|0 +community|arabic_mmlu_mt:high_school_world_history|0|0 +community|arabic_mmlu_mt:human_aging|0|0 +community|arabic_mmlu_mt:human_sexuality|0|0 +community|arabic_mmlu_mt:international_law|0|0 +community|arabic_mmlu_mt:jurisprudence|0|0 +community|arabic_mmlu_mt:logical_fallacies|0|0 +community|arabic_mmlu_mt:machine_learning|0|0 +community|arabic_mmlu_mt:management|0|0 +community|arabic_mmlu_mt:marketing|0|0 +community|arabic_mmlu_mt:medical_genetics|0|0 +community|arabic_mmlu_mt:miscellaneous|0|0 +community|arabic_mmlu_mt:moral_disputes|0|0 +community|arabic_mmlu_mt:moral_scenarios|0|0 +community|arabic_mmlu_mt:nutrition|0|0 +community|arabic_mmlu_mt:philosophy|0|0 +community|arabic_mmlu_mt:prehistory|0|0 +community|arabic_mmlu_mt:professional_accounting|0|0 +community|arabic_mmlu_mt:professional_law|0|0 +community|arabic_mmlu_mt:professional_medicine|0|0 +community|arabic_mmlu_mt:professional_psychology|0|0 +community|arabic_mmlu_mt:public_relations|0|0 +community|arabic_mmlu_mt:security_studies|0|0 +community|arabic_mmlu_mt:sociology|0|0 +community|arabic_mmlu_mt:us_foreign_policy|0|0 +community|arabic_mmlu_mt:virology|0|0 +community|arabic_mmlu_mt:world_religions|0|0 +community|acva:Algeria|0|0 +community|acva:Ancient_Egypt|0|0 +community|acva:Arab_Empire|0|0 +community|acva:Arabic_Architecture|0|0 +community|acva:Arabic_Art|0|0 +community|acva:Arabic_Astronomy|0|0 +community|acva:Arabic_Calligraphy|0|0 +community|acva:Arabic_Ceremony|0|0 +community|acva:Arabic_Clothing|0|0 +community|acva:Arabic_Culture|0|0 +community|acva:Arabic_Food|0|0 +community|acva:Arabic_Funeral|0|0 +community|acva:Arabic_Geography|0|0 +community|acva:Arabic_History|0|0 +community|acva:Arabic_Language_Origin|0|0 +community|acva:Arabic_Literature|0|0 +community|acva:Arabic_Math|0|0 +community|acva:Arabic_Medicine|0|0 +community|acva:Arabic_Music|0|0 +community|acva:Arabic_Ornament|0|0 +community|acva:Arabic_Philosophy|0|0 +community|acva:Arabic_Physics_and_Chemistry|0|0 +community|acva:Arabic_Wedding|0|0 +community|acva:Bahrain|0|0 +community|acva:Comoros|0|0 +community|acva:Egypt_modern|0|0 +community|acva:InfluenceFromAncientEgypt|0|0 +community|acva:InfluenceFromByzantium|0|0 +community|acva:InfluenceFromChina|0|0 +community|acva:InfluenceFromGreece|0|0 +community|acva:InfluenceFromIslam|0|0 +community|acva:InfluenceFromPersia|0|0 +community|acva:InfluenceFromRome|0|0 +community|acva:Iraq|0|0 +community|acva:Islam_Education|0|0 +community|acva:Islam_branches_and_schools|0|0 +community|acva:Islamic_law_system|0|0 +community|acva:Jordan|0|0 +community|acva:Kuwait|0|0 +community|acva:Lebanon|0|0 +community|acva:Libya|0|0 +community|acva:Mauritania|0|0 +community|acva:Mesopotamia_civilization|0|0 +community|acva:Morocco|0|0 +community|acva:Oman|0|0 +community|acva:Palestine|0|0 +community|acva:Qatar|0|0 +community|acva:Saudi_Arabia|0|0 +community|acva:Somalia|0|0 +community|acva:Sudan|0|0 +community|acva:Syria|0|0 +community|acva:Tunisia|0|0 +community|acva:United_Arab_Emirates|0|0 +community|acva:Yemen|0|0 +community|acva:communication|0|0 +community|acva:computer_and_phone|0|0 +community|acva:daily_life|0|0 +community|acva:entertainment|0|0 +community|alghafa:mcq_exams_test_ar|0|0 +community|alghafa:meta_ar_dialects|0|0 +community|alghafa:meta_ar_msa|0|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 +community|alghafa:multiple_choice_rating_sentiment_task|0|0 +community|alghafa:multiple_choice_sentiment_task|0|0 +community|race_ar|0|0 +community|piqa_ar|0|0 +community|arc_easy_ar|0|0 +community|arc_challenge_okapi_ar|0|0 +community|mmlu_okapi_ar|0|0 +community|openbook_qa_ext_ar|0|0 +community|boolq_ar|0|0 +community|copa_ext_ar|0|0 +community|hellaswag_okapi_ar|0|0 +community|toxigen_ar|0|0 +community|sciq_ar|0|0 +community|arabic_mmlu_ht:abstract_algebra|0|0 +community|arabic_mmlu_ht:anatomy|0|0 +community|arabic_mmlu_ht:astronomy|0|0 +community|arabic_mmlu_ht:business_ethics|0|0 +community|arabic_mmlu_ht:clinical_knowledge|0|0 +community|arabic_mmlu_ht:college_biology|0|0 +community|arabic_mmlu_ht:college_chemistry|0|0 +community|arabic_mmlu_ht:college_computer_science|0|0 +community|arabic_mmlu_ht:college_mathematics|0|0 +community|arabic_mmlu_ht:college_medicine|0|0 +community|arabic_mmlu_ht:college_physics|0|0 +community|arabic_mmlu_ht:computer_security|0|0 +community|arabic_mmlu_ht:conceptual_physics|0|0 +community|arabic_mmlu_ht:econometrics|0|0 +community|arabic_mmlu_ht:electrical_engineering|0|0 +community|arabic_mmlu_ht:elementary_mathematics|0|0 +community|arabic_mmlu_ht:formal_logic|0|0 +community|arabic_mmlu_ht:global_facts|0|0 +community|arabic_mmlu_ht:high_school_biology|0|0 +community|arabic_mmlu_ht:high_school_chemistry|0|0 +community|arabic_mmlu_ht:high_school_computer_science|0|0 +community|arabic_mmlu_ht:high_school_european_history|0|0 +community|arabic_mmlu_ht:high_school_geography|0|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0|0 +community|arabic_mmlu_ht:high_school_mathematics|0|0 +community|arabic_mmlu_ht:high_school_microeconomics|0|0 +community|arabic_mmlu_ht:high_school_physics|0|0 +community|arabic_mmlu_ht:high_school_psychology|0|0 +community|arabic_mmlu_ht:high_school_statistics|0|0 +community|arabic_mmlu_ht:high_school_us_history|0|0 +community|arabic_mmlu_ht:high_school_world_history|0|0 +community|arabic_mmlu_ht:human_aging|0|0 +community|arabic_mmlu_ht:human_sexuality|0|0 +community|arabic_mmlu_ht:international_law|0|0 +community|arabic_mmlu_ht:jurisprudence|0|0 +community|arabic_mmlu_ht:logical_fallacies|0|0 +community|arabic_mmlu_ht:machine_learning|0|0 +community|arabic_mmlu_ht:management|0|0 +community|arabic_mmlu_ht:marketing|0|0 +community|arabic_mmlu_ht:medical_genetics|0|0 +community|arabic_mmlu_ht:miscellaneous|0|0 +community|arabic_mmlu_ht:moral_disputes|0|0 +community|arabic_mmlu_ht:moral_scenarios|0|0 +community|arabic_mmlu_ht:nutrition|0|0 +community|arabic_mmlu_ht:philosophy|0|0 +community|arabic_mmlu_ht:prehistory|0|0 +community|arabic_mmlu_ht:professional_accounting|0|0 +community|arabic_mmlu_ht:professional_law|0|0 +community|arabic_mmlu_ht:professional_medicine|0|0 +community|arabic_mmlu_ht:professional_psychology|0|0 +community|arabic_mmlu_ht:public_relations|0|0 +community|arabic_mmlu_ht:security_studies|0|0 +community|arabic_mmlu_ht:sociology|0|0 +community|arabic_mmlu_ht:us_foreign_policy|0|0 +community|arabic_mmlu_ht:virology|0|0 +community|arabic_mmlu_ht:world_religions|0|0 +community|arabic_mmlu:Islamic Studies|0|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0|0 +community|arabic_mmlu:Islamic Studies (High School)|0|0 +community|arabic_mmlu:Driving Test|0|0 +community|arabic_mmlu:Natural Science (Middle School)|0|0 +community|arabic_mmlu:Natural Science (Primary School)|0|0 +community|arabic_mmlu:History (Middle School)|0|0 +community|arabic_mmlu:History (Primary School)|0|0 +community|arabic_mmlu:History (High School)|0|0 +community|arabic_mmlu:General Knowledge|0|0 +community|arabic_mmlu:General Knowledge (Middle School)|0|0 +community|arabic_mmlu:General Knowledge (Primary School)|0|0 +community|arabic_mmlu:Law (Professional)|0|0 +community|arabic_mmlu:Physics (High School)|0|0 +community|arabic_mmlu:Social Science (Middle School)|0|0 +community|arabic_mmlu:Social Science (Primary School)|0|0 +community|arabic_mmlu:Management (University)|0|0 +community|arabic_mmlu:Arabic Language (Middle School)|0|0 +community|arabic_mmlu:Arabic Language (Primary School)|0|0 +community|arabic_mmlu:Arabic Language (High School)|0|0 +community|arabic_mmlu:Political Science (University)|0|0 +community|arabic_mmlu:Philosophy (High School)|0|0 +community|arabic_mmlu:Accounting (University)|0|0 +community|arabic_mmlu:Computer Science (Middle School)|0|0 +community|arabic_mmlu:Computer Science (Primary School)|0|0 +community|arabic_mmlu:Computer Science (High School)|0|0 +community|arabic_mmlu:Computer Science (University)|0|0 +community|arabic_mmlu:Geography (Middle School)|0|0 +community|arabic_mmlu:Geography (Primary School)|0|0 +community|arabic_mmlu:Geography (High School)|0|0 +community|arabic_mmlu:Math (Primary School)|0|0 +community|arabic_mmlu:Biology (High School)|0|0 +community|arabic_mmlu:Economics (Middle School)|0|0 +community|arabic_mmlu:Economics (High School)|0|0 +community|arabic_mmlu:Economics (University)|0|0 +community|arabic_mmlu:Arabic Language (General)|0|0 +community|arabic_mmlu:Arabic Language (Grammar)|0|0 +community|arabic_mmlu:Civics (Middle School)|0|0 +community|arabic_mmlu:Civics (High School)|0|0 +community|madinah_qa:Arabic Language (General)|0|0 +community|madinah_qa:Arabic Language (Grammar)|0|0 +community|aratrust:Trustfulness|0|0 +community|aratrust:MentalHealth|0|0 +community|aratrust:PhysicalHealth|0|0 +community|aratrust:Offensive|0|0 +community|aratrust:Ethics|0|0 +community|aratrust:Privacy|0|0 +community|aratrust:Unfairness|0|0 +community|aratrust:Illegal|0|0 diff --git a/src/lighteval/tasks/multilingual/tasks.py b/src/lighteval/tasks/multilingual/tasks.py index 93d8cea40..3d92a71e2 100644 --- a/src/lighteval/tasks/multilingual/tasks.py +++ b/src/lighteval/tasks/multilingual/tasks.py @@ -2117,6 +2117,7 @@ ] ] + TURKISH_MMLU_SUBSET = [ "Biology", "Chemistry",