Fix formatting and linting issues via pre-commit hooks
Manel-Hik committed Dec 25, 2024
1 parent f711450 commit b9c5710
Showing 1 changed file with 35 additions and 34 deletions.
69 changes: 35 additions & 34 deletions community_tasks/arabic_evals.py
@@ -28,18 +28,15 @@
"""
import random
import re
from typing import Dict, List, Optional

from lighteval.metrics.metrics import Metrics
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import Metric, MetricCategory, Metrics # Import MetricCategory and Metric
from lighteval.metrics.utils.metric_utils import MetricUseCase
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

from typing import List, Dict, Optional
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import MetricCategory, Metric # Import MetricCategory and Metric
from lighteval.metrics.utils.metric_utils import MetricUseCase



# fmt: off
LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
@@ -845,17 +842,17 @@ def __init__(self, judge: JudgeLM):
self.category = MetricCategory.LLM_AS_JUDGE # Add the category attribute
self.corpus_level_fn = self.aggregate_scores # Define the corpus level function
self.sample_level_fn = self._sample_level_fn
self.higher_is_better= True,
self.higher_is_better = (True,)
self.use_case = MetricUseCase.NONE

def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
"""
Compute the score using the judge's evaluate_answer method.
Args:
predictions (list[str]): The predicted answers.
formatted_docs (list[Doc]): The formatted documents containing questions and gold answers.
Returns:
dict[str, float]: A dictionary containing the evaluation scores.
"""
@@ -879,25 +876,26 @@ def aggregate_scores(self, scores: list[dict]) -> float:
def _sample_level_fn(self):
return None


def qa_prompt_arabic(line: Dict, task_name: str = None) -> Doc:
"""Format the prompt for question answering with candidates"""

# Check the input line structure

question = str(line["question"])

# Convert candidates to string if it isn't already
if isinstance(line["candidates"], list):
candidates = [str(c) for c in line["candidates"]]
else:
candidates = str(line["candidates"]).split('\n')
candidates = str(line["candidates"]).split("\n")

# Clean up candidates
candidates = [c.strip() for c in candidates if c.strip()]

instruction = "بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي"
query = f"{instruction}\n\nالسؤال:\n{question}\n\nالسياقات المقترحة:\n{', '.join(candidates)}\n"

# Ensure gold_answer is a string
gold_answer = str(line.get("gold_answer", "")) # Ensure this is set correctly

@@ -907,25 +905,26 @@ def qa_prompt_arabic(line: Dict, task_name: str = None) -> Doc:
query=query,
instruction=instruction,
choices=[gold_answer], # Ensure this is populated correctly
gold_index= 0
gold_index=0,
)

return doc



def judge_template(question: str, answer: str, gold: str, options: Optional[List[str]] = None) -> List[Dict[str, str]]:
"""Template for the judge prompt in Arabic"""
messages = [
{
"role": "system",
"role": "system",
"content": """أنت مقيّم محايد خبير. مهمتك هي:
1. تقييم دقة الإجابة مقارنة بالإجابة الصحيحة
2. التحقق من أن الإجابة مدعومة بالسياق المقدم
3. تقييم جودة وشمولية الإجابة
قم بتقييم الإجابة على مقياس من 0 إلى 10."""
قم بتقييم الإجابة على مقياس من 0 إلى 10.""",
},
{
"role": "user",
"role": "user",
"content": f"""{question}
الإجابة المقدمة: {answer}
Expand All @@ -939,33 +938,37 @@ def judge_template(question: str, answer: str, gold: str, options: Optional[List
- 7-8: إجابة جيدة مع بعض النقص
- 9-10: إجابة ممتازة ودقيقة
قدم تقييمك كرقم فقط."""
}
قدم تقييمك كرقم فقط.""",
},
]
return messages


def process_judge_response(response) -> float:
"""Process the judge's response to extract the score"""
# If response is a list, extract the content from the user role
if isinstance(response, list):
# Join the content from the user role into a single string
response_content = ' '.join(item['content'] for item in response if item['role'] == 'user')
response_content = " ".join(item["content"] for item in response if item["role"] == "user")
else:
response_content = response # If it's not a list, use it directly

try:
# Extract the score from the response content
score = float(next(num for num in response_content.split() if num.replace('.', '', 1).isdigit()))
score = float(next(num for num in response_content.split() if num.replace(".", "", 1).isdigit()))
return min(max(score / 10.0, 0.0), 1.0)
except (StopIteration, ValueError):
return 0.0


# Initialize the judge metric


judge = JudgeLM(
model="Qwen/Qwen2.5-7B-Instruct",
model="Qwen/Qwen2.5-7B-Instruct",
templates=judge_template,
process_judge_response=process_judge_response,
judge_backend="vllm"
judge_backend="vllm",
)

# Wrap the judge in the new wrapper class
@@ -979,17 +982,15 @@ def process_judge_response(response) -> float:
hf_repo="OALL/ALRAGE",
hf_subset=None,
hf_avail_splits=["train"], # Only the train split is available
evaluation_splits=["train"],
metric=[wrapped_judge],
evaluation_splits=["train"],
metric=[wrapped_judge],
trust_dataset=True,
generation_size=200, ## updated
stop_sequence=[], ## updated
version=0
generation_size=200, # updated
stop_sequence=[], # updated
version=0,
)




TASKS_TABLE = (
ARABIC_MMLU_TASKS
+ ARABIC_MMLU_HT_TASKS
@@ -1010,5 +1011,5 @@ def process_judge_response(response) -> float:
+ [hellaswag_okapi_ar_task]
+ [toxigen_ar_task]
+ [sciq_ar_task]
+[alrage_qa_task]
+ [alrage_qa_task]
)
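
As a side note on how the judge scoring in this file behaves, the sketch below (not part of the commit) mirrors the number-extraction logic from process_judge_response above: it takes the first numeric token in a judge reply and scales the 0-10 rating to a 0.0-1.0 score. The helper name extract_normalized_score and the sample replies are made up for illustration.

# Minimal illustration (not from the commit): mirrors the score extraction in
# process_judge_response above. The reply strings below are made up.
def extract_normalized_score(response_content: str) -> float:
    """Take the first numeric token in the judge reply and scale 0-10 to 0.0-1.0."""
    try:
        score = float(next(num for num in response_content.split() if num.replace(".", "", 1).isdigit()))
        return min(max(score / 10.0, 0.0), 1.0)
    except (StopIteration, ValueError):
        return 0.0

print(extract_normalized_score("التقييم: 8"))   # -> 0.8
print(extract_normalized_score("لا يوجد رقم"))  # -> 0.0 (no numeric token found)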
