Fix formatting and linting issues via pre-commit hooks
Manel-Hik committed Dec 25, 2024
1 parent f711450 commit b9c5710
Showing 1 changed file with 35 additions and 34 deletions.
69 changes: 35 additions & 34 deletions community_tasks/arabic_evals.py
@@ -28,18 +28,15 @@
"""
import random
import re
from typing import Dict, List, Optional

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import Metric, MetricCategory, Metrics # Import MetricCategory and Metric
from lighteval.metrics.utils.metric_utils import MetricUseCase
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

from typing import List, Dict, Optional
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import MetricCategory, Metric # Import MetricCategory and Metric
from lighteval.metrics.utils.metric_utils import MetricUseCase



# fmt: off
LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
@@ -845,17 +842,17 @@ def __init__(self, judge: JudgeLM):
self.category = MetricCategory.LLM_AS_JUDGE # Add the category attribute
self.corpus_level_fn = self.aggregate_scores # Define the corpus level function
self.sample_level_fn = self._sample_level_fn
self.higher_is_better= True,
self.higher_is_better = (True,)
self.use_case = MetricUseCase.NONE

def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
"""
Compute the score using the judge's evaluate_answer method.
Args:
predictions (list[str]): The predicted answers.
formatted_docs (list[Doc]): The formatted documents containing questions and gold answers.
Returns:
dict[str, float]: A dictionary containing the evaluation scores.
"""
@@ -879,25 +876,26 @@ def aggregate_scores(self, scores: list[dict]) -> float:
def _sample_level_fn(self):
return None


def qa_prompt_arabic(line: Dict, task_name: str = None) -> Doc:
"""Format the prompt for question answering with candidates"""

# Check the input line structure

question = str(line["question"])

# Convert candidates to string if it isn't already
if isinstance(line["candidates"], list):
candidates = [str(c) for c in line["candidates"]]
else:
candidates = str(line["candidates"]).split('\n')
candidates = str(line["candidates"]).split("\n")

# Clean up candidates
candidates = [c.strip() for c in candidates if c.strip()]

instruction = "بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي"
query = f"{instruction}\n\nالسؤال:\n{question}\n\nالسياقات المقترحة:\n{', '.join(candidates)}\n"

# Ensure gold_answer is a string
gold_answer = str(line.get("gold_answer", "")) # Ensure this is set correctly

@@ -907,25 +905,26 @@ def qa_prompt_arabic(line: Dict, task_name: str = None) -> Doc:
query=query,
instruction=instruction,
choices=[gold_answer], # Ensure this is populated correctly
gold_index= 0
gold_index=0,
)

return doc



def judge_template(question: str, answer: str, gold: str, options: Optional[List[str]] = None) -> List[Dict[str, str]]:
"""Template for the judge prompt in Arabic"""
messages = [
{
"role": "system",
"role": "system",
"content": """أنت مقيّم محايد خبير. مهمتك هي:
1. تقييم دقة الإجابة مقارنة بالإجابة الصحيحة
2. التحقق من أن الإجابة مدعومة بالسياق المقدم
3. تقييم جودة وشمولية الإجابة
قم بتقييم الإجابة على مقياس من 0 إلى 10."""
قم بتقييم الإجابة على مقياس من 0 إلى 10.""",
},
{
"role": "user",
"role": "user",
"content": f"""{question}
الإجابة المقدمة: {answer}
Expand All @@ -939,33 +938,37 @@ def judge_template(question: str, answer: str, gold: str, options: Optional[List
- 7-8: إجابة جيدة مع بعض النقص
- 9-10: إجابة ممتازة ودقيقة
قدم تقييمك كرقم فقط."""
}
قدم تقييمك كرقم فقط.""",
},
]
return messages


def process_judge_response(response) -> float:
"""Process the judge's response to extract the score"""
# If response is a list, extract the content from the user role
if isinstance(response, list):
# Join the content from the user role into a single string
response_content = ' '.join(item['content'] for item in response if item['role'] == 'user')
response_content = " ".join(item["content"] for item in response if item["role"] == "user")
else:
response_content = response # If it's not a list, use it directly

try:
# Extract the score from the response content
score = float(next(num for num in response_content.split() if num.replace('.', '', 1).isdigit()))
score = float(next(num for num in response_content.split() if num.replace(".", "", 1).isdigit()))
return min(max(score / 10.0, 0.0), 1.0)
except (StopIteration, ValueError):
return 0.0


# Initialize the judge metric


judge = JudgeLM(
model="Qwen/Qwen2.5-7B-Instruct",
model="Qwen/Qwen2.5-7B-Instruct",
templates=judge_template,
process_judge_response=process_judge_response,
judge_backend="vllm"
judge_backend="vllm",
)

# Wrap the judge in the new wrapper class
@@ -979,17 +982,15 @@ def process_judge_response(response) -> float:
hf_repo="OALL/ALRAGE",
hf_subset=None,
hf_avail_splits=["train"], # Only the train split is available
evaluation_splits=["train"],
metric=[wrapped_judge],
evaluation_splits=["train"],
metric=[wrapped_judge],
trust_dataset=True,
generation_size=200, ## updated
stop_sequence=[], ## updated
version=0
generation_size=200, # updated
stop_sequence=[], # updated
version=0,
)




TASKS_TABLE = (
ARABIC_MMLU_TASKS
+ ARABIC_MMLU_HT_TASKS
@@ -1010,5 +1011,5 @@ def process_judge_response(response) -> float:
+ [hellaswag_okapi_ar_task]
+ [toxigen_ar_task]
+ [sciq_ar_task]
+[alrage_qa_task]
+ [alrage_qa_task]
)
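
As a side note on how the judge scoring in this file behaves, the sketch below (not part of the commit) mirrors the number-extraction logic from process_judge_response above: it takes the first numeric token in a judge reply and scales the 0-10 rating to a 0.0-1.0 score. The helper name extract_normalized_score and the sample replies are made up for illustration.

# Minimal illustration (not from the commit): mirrors the score extraction in
# process_judge_response above. The reply strings below are made up.
def extract_normalized_score(response_content: str) -> float:
    """Take the first numeric token in the judge reply and scale 0-10 to 0.0-1.0."""
    try:
        score = float(next(num for num in response_content.split() if num.replace(".", "", 1).isdigit()))
        return min(max(score / 10.0, 0.0), 1.0)
    except (StopIteration, ValueError):
        return 0.0

print(extract_normalized_score("التقييم: 8"))   # -> 0.8
print(extract_normalized_score("لا يوجد رقم"))  # -> 0.0 (no numeric token found)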
