diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 47ff0cba..71b8331c 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -28,18 +28,15 @@
 """
 
 import random
 import re
+from typing import Dict, List, Optional
 
-from lighteval.metrics.metrics import Metrics
+from lighteval.metrics.llm_as_judge import JudgeLM
+from lighteval.metrics.metrics import Metric, MetricCategory, Metrics  # Import MetricCategory and Metric
+from lighteval.metrics.utils.metric_utils import MetricUseCase
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
-from typing import List, Dict, Optional
-from lighteval.metrics.llm_as_judge import JudgeLM
-from lighteval.metrics.metrics import MetricCategory, Metric  # Import MetricCategory and Metric
-from lighteval.metrics.utils.metric_utils import MetricUseCase
-
-
 
 # fmt: off
 LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
@@ -845,17 +842,17 @@ def __init__(self, judge: JudgeLM):
         self.category = MetricCategory.LLM_AS_JUDGE  # Add the category attribute
         self.corpus_level_fn = self.aggregate_scores  # Define the corpus level function
         self.sample_level_fn = self._sample_level_fn
-        self.higher_is_better= True,
+        self.higher_is_better = (True,)
         self.use_case = MetricUseCase.NONE
 
     def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
         """
         Compute the score using the judge's evaluate_answer method.
-
+
         Args:
             predictions (list[str]): The predicted answers.
             formatted_docs (list[Doc]): The formatted documents containing questions and gold answers.
-
+
         Returns:
             dict[str, float]: A dictionary containing the evaluation scores.
         """
@@ -879,25 +876,26 @@ def aggregate_scores(self, scores: list[dict]) -> float:
 
     def _sample_level_fn(self):
         return None
 
+
 def qa_prompt_arabic(line: Dict, task_name: str = None) -> Doc:
     """Format the prompt for question answering with candidates"""
-
+
     # Check the input line structure
     question = str(line["question"])
-
+
     # Convert candidates to string if it isn't already
     if isinstance(line["candidates"], list):
         candidates = [str(c) for c in line["candidates"]]
     else:
-        candidates = str(line["candidates"]).split('\n')
-
+        candidates = str(line["candidates"]).split("\n")
+
     # Clean up candidates
     candidates = [c.strip() for c in candidates if c.strip()]
 
     instruction = "بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي"
 
     query = f"{instruction}\n\nالسؤال:\n{question}\n\nالسياقات المقترحة:\n{', '.join(candidates)}\n"
-
+
     # Ensure gold_answer is a string
     gold_answer = str(line.get("gold_answer", ""))  # Ensure this is set correctly
@@ -907,25 +905,26 @@ def qa_prompt_arabic(line: Dict, task_name: str = None) -> Doc:
         query=query,
         instruction=instruction,
         choices=[gold_answer],  # Ensure this is populated correctly
-        gold_index= 0
+        gold_index=0,
     )
 
     return doc
-
+
+
 def judge_template(question: str, answer: str, gold: str, options: Optional[List[str]] = None) -> List[Dict[str, str]]:
     """Template for the judge prompt in Arabic"""
     messages = [
         {
-            "role": "system", 
+            "role": "system",
             "content": """أنت مقيّم محايد خبير. مهمتك هي:
 1. تقييم دقة الإجابة مقارنة بالإجابة الصحيحة
 2. التحقق من أن الإجابة مدعومة بالسياق المقدم
 3. تقييم جودة وشمولية الإجابة
 
-قم بتقييم الإجابة على مقياس من 0 إلى 10."""
+قم بتقييم الإجابة على مقياس من 0 إلى 10.""",
         },
         {
-            "role": "user", 
+            "role": "user",
             "content": f"""{question}
 
 الإجابة المقدمة: {answer}
@@ -939,33 +938,37 @@ def judge_template(question: str, answer: str, gold: str, options: Optional[List
 - 7-8: إجابة جيدة مع بعض النقص
 - 9-10: إجابة ممتازة ودقيقة
 
-قدم تقييمك كرقم فقط."""
-        }
+قدم تقييمك كرقم فقط.""",
+        },
     ]
     return messages
 
+
 def process_judge_response(response) -> float:
     """Process the judge's response to extract the score"""
     # If response is a list, extract the content from the user role
     if isinstance(response, list):
         # Join the content from the user role into a single string
-        response_content = ' '.join(item['content'] for item in response if item['role'] == 'user')
+        response_content = " ".join(item["content"] for item in response if item["role"] == "user")
     else:
         response_content = response  # If it's not a list, use it directly
 
     try:
         # Extract the score from the response content
-        score = float(next(num for num in response_content.split() if num.replace('.', '', 1).isdigit()))
+        score = float(next(num for num in response_content.split() if num.replace(".", "", 1).isdigit()))
         return min(max(score / 10.0, 0.0), 1.0)
     except (StopIteration, ValueError):
         return 0.0
 
+
 # Initialize the judge metric
+
+
 judge = JudgeLM(
-    model="Qwen/Qwen2.5-7B-Instruct", 
+    model="Qwen/Qwen2.5-7B-Instruct",
     templates=judge_template,
     process_judge_response=process_judge_response,
-    judge_backend="vllm"
+    judge_backend="vllm",
 )
 
 # Wrap the judge in the new wrapper class
@@ -979,17 +982,15 @@ def process_judge_response(response) -> float:
     hf_repo="OALL/ALRAGE",
     hf_subset=None,
     hf_avail_splits=["train"],  # Only the train split is available
-    evaluation_splits=["train"], 
-    metric=[wrapped_judge], 
+    evaluation_splits=["train"],
+    metric=[wrapped_judge],
     trust_dataset=True,
-    generation_size=200, ## updated
-    stop_sequence=[], ## updated
-    version=0
+    generation_size=200,  # updated
+    stop_sequence=[],  # updated
+    version=0,
 )
 
 
-
-
 TASKS_TABLE = (
     ARABIC_MMLU_TASKS
     + ARABIC_MMLU_HT_TASKS
@@ -1010,5 +1011,5 @@ def process_judge_response(response) -> float:
     + [hellaswag_okapi_ar_task]
     + [toxigen_ar_task]
     + [sciq_ar_task]
-    +[alrage_qa_task]
+    + [alrage_qa_task]
 )
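As a quick, self-contained illustration of the scoring behaviour touched by this patch, the sketch below copies the parsing logic of process_judge_response and runs it on a few made-up judge replies (the helper name normalize_judge_score and the example strings are illustrative only, not part of the patch or real model output): the first numeric token is read as a 0-10 score and normalized to the 0.0-1.0 range, with anything unparsable falling back to 0.0.

# Standalone sketch: same parsing logic as process_judge_response in the patch above.
def normalize_judge_score(response) -> float:
    """Extract the first numeric token from the judge reply and map it from 0-10 to 0.0-1.0."""
    if isinstance(response, list):
        # Chat-style reply: keep only the user-role contents.
        response_content = " ".join(item["content"] for item in response if item["role"] == "user")
    else:
        response_content = response
    try:
        score = float(next(num for num in response_content.split() if num.replace(".", "", 1).isdigit()))
        return min(max(score / 10.0, 0.0), 1.0)
    except (StopIteration, ValueError):
        return 0.0


# Illustrative inputs only (not real judge output):
assert normalize_judge_score("8") == 0.8               # bare score
assert normalize_judge_score("التقييم: 7.5") == 0.75   # score embedded in text
assert normalize_judge_score("15") == 1.0              # out-of-range scores are clamped
assert normalize_judge_score("لا يوجد رقم") == 0.0     # no number found -> fallback to 0.0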