# evaluation.py
import pandas as pd
import re
df = pd.read_excel("all_QA.xlsx")
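
# Note: reading an .xlsx file with pandas requires the openpyxl package.
# Based on the column accesses below, all_QA.xlsx is expected to contain at least
# 'Correct Answer', 'Question Type', 'TOPIC', 'Corresponding Value',
# and one column per evaluated model (here 'LLAMA3-8B').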

def extract_corresponding_value(question, correct_answer):
    # Find the option corresponding to the correct answer letter
    match = re.search(r'\({}\)\s*(.*?)(?:\s*\(|$)'.format(correct_answer), question)
    if match:
        extracted_value = match.group(1).strip()
        if correct_answer == 'D':
            extracted_value = extracted_value[:-2]  # Exclude the last two characters
        return extracted_value
    return None
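
# Note: extract_corresponding_value is not called below; the 'Corresponding Value'
# column read in is_correct_prediction is assumed to already exist in all_QA.xlsx.
# If it does not, it could be derived roughly as follows (the 'Question' column
# name is a hypothetical placeholder for wherever the question text is stored):
# df['Corresponding Value'] = df.apply(
#     lambda r: extract_corresponding_value(str(r['Question']), str(r['Correct Answer'])), axis=1)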

# Function to check whether a prediction is correct for a given row
def is_correct_prediction(row):
    ground_truth = str(row['Correct Answer'])
    # Change the column name below to the model you want to evaluate
    predicted = row['LLAMA3-8B']
    question_type = row['Question Type']
    # NUM category: numeric answers, possibly expressed as a range
    if question_type == 'NUM':
        if 'to' in ground_truth or ':' in ground_truth:
            try:
                range_numbers = re.findall(r'\d+\.\d+|\d+', ground_truth)
                range_start, range_end = map(float, [range_numbers[0], range_numbers[-1]])
                if range_start <= float(predicted) <= range_end:
                    return True
            except (ValueError, IndexError):
                pass
        elif str(predicted) == ground_truth:
            return True
    # MATCH (and any other non-NUM) categories
    else:
        # First, check whether the predicted answer matches the ground truth
        if str(predicted) == ground_truth:
            return True
        # If not, check whether the prediction matches the option text of the correct answer
        corresponding_answer = row['Corresponding Value']
        if corresponding_answer and predicted == corresponding_answer:
            return True
    # Otherwise, fall back to an exact string match
    return str(predicted) == ground_truth
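
# Example (hypothetical row values): a non-NUM row where the model returns the
# correct answer letter is judged correct by the exact-match check, e.g.
#   is_correct_prediction(pd.Series({'Correct Answer': 'B', 'LLAMA3-8B': 'B',
#                                    'Question Type': 'MCQ', 'Corresponding Value': None}))
# returns True.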

def calculate_accuracy(df):
    # Per (Question Type, TOPIC) tallies of correct predictions and question counts
    results_by_category = (
        df.apply(is_correct_prediction, axis=1)
        .groupby([df['Question Type'], df['TOPIC']])
        .agg(['sum', 'count'])
    )
    accuracies = {}
    total_correct_predictions = 0
    total_questions = 0
    print("\nResults by category (Correct Predictions / Total Questions):")
    for category, result in results_by_category.iterrows():
        correct_predictions = result['sum']
        total = result['count']
        accuracy = correct_predictions / total
        accuracies[category] = accuracy
        total_correct_predictions += correct_predictions
        total_questions += total
        print(f"{category}: {correct_predictions} / {total} (Accuracy: {accuracy:.2%})")
    # Calculate accuracy for each question type (NUM, MATCH, etc.)
    for question_type in df['Question Type'].unique():
        questions_subset = df[df['Question Type'] == question_type]
        correct_predictions_subset = questions_subset.apply(is_correct_prediction, axis=1).sum()
        total_questions_subset = len(questions_subset)
        accuracy_subset = correct_predictions_subset / total_questions_subset
        accuracies[question_type] = accuracy_subset  # store the per-type accuracy
        print(f"{question_type} Accuracy: {correct_predictions_subset} / {total_questions_subset} (Accuracy: {accuracy_subset:.2%})")
    overall_accuracy = total_correct_predictions / total_questions
    print(f"\nOverall Accuracy: {total_correct_predictions} / {total_questions} (Accuracy: {overall_accuracy:.2%})")
    return accuracies, overall_accuracy
category_accuracies, overall_accuracy = calculate_accuracy(df)
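
# Optional sketch: persist the per-category and per-type accuracies for later
# analysis; the output file name 'category_accuracies.csv' is an arbitrary choice.
# pd.Series(category_accuracies).to_csv("category_accuracies.csv")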