Merge branch 'feature/support-official-paperbench' of https://github.com/ulab-uiuc/research-town into feature/support-official-paperbench
lwaekfjlk committed Dec 2, 2024
2 parents cbcba1c + 7832553 commit ea3ec07
Showing 3 changed files with 6 additions and 23 deletions.
6 changes: 0 additions & 6 deletions research_bench/eval.py
@@ -248,7 +248,6 @@ def compute_bertscore_per_question(reference: str, hypothesis: str) -> List[float]:
 def compute_proposal_metrics(reference: str, generation: str) -> Dict[str, float]:
     bleu = compute_bleu(reference, generation)
     rouge_l = compute_rouge_l(reference, generation)
-    bert_score_per_question = compute_bertscore_per_question(reference, generation)
     openai_sim_per_question = compute_openai_embedding_similarity_per_question(
         reference, generation
     )
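Note: compute_bleu and compute_rouge_l are defined elsewhere in eval.py and sit outside the visible hunks. Below is a generic, self-contained sketch of what such helpers commonly look like, assuming nltk and rouge-score as stand-ins; the repository's own implementations may differ.

# Illustration only; not the repository's code. Assumes `nltk` and `rouge-score` are installed.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer

def compute_bleu(reference: str, hypothesis: str) -> float:
    # Sentence-level BLEU with smoothing so short texts do not collapse to zero.
    smoothing = SmoothingFunction().method1
    return sentence_bleu([reference.split()], hypothesis.split(), smoothing_function=smoothing)

def compute_rouge_l(reference: str, hypothesis: str) -> float:
    # ROUGE-L F1 between the reference and the hypothesis.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    return scorer.score(reference, hypothesis)['rougeL'].fmeasure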
@@ -259,11 +258,6 @@ def compute_proposal_metrics(reference: str, generation: str) -> Dict[str, float]:
     return {
         'bleu': bleu,
         'rouge_l': rouge_l,
-        'bert_score_q1': bert_score_per_question[0],
-        'bert_score_q2': bert_score_per_question[1],
-        'bert_score_q3': bert_score_per_question[2],
-        'bert_score_q4': bert_score_per_question[3],
-        'bert_score_q5': bert_score_per_question[4],
         'openai_sim_q1': openai_sim_per_question[0],
         'openai_sim_q2': openai_sim_per_question[1],
         'openai_sim_q3': openai_sim_per_question[2],
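The per-question metric that survives this change goes through compute_openai_embedding_similarity_per_question, which is also outside the visible hunks. The sketch below shows one plausible shape for such a helper, assuming the proposal has already been split into five question-aligned strings, the OpenAI embeddings API (openai>=1.0) with the text-embedding-3-small model, and cosine similarity; the actual helper in research_bench/eval.py may split questions and pick models differently.

# Illustration only; not the repository's implementation. Assumes OPENAI_API_KEY is set.
from typing import List

import numpy as np
from openai import OpenAI

def embedding_similarity_per_question(reference_questions: List[str], generated_questions: List[str]) -> List[float]:
    # Embed all reference and generated answers in a single request.
    client = OpenAI()
    response = client.embeddings.create(
        model='text-embedding-3-small',  # assumed model choice
        input=reference_questions + generated_questions,
    )
    vectors = np.array([item.embedding for item in response.data])
    n = len(reference_questions)
    refs, gens = vectors[:n], vectors[n:]
    # Cosine similarity between each aligned reference/generation pair.
    sims = np.sum(refs * gens, axis=1) / (np.linalg.norm(refs, axis=1) * np.linalg.norm(gens, axis=1))
    return sims.tolist()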
19 changes: 4 additions & 15 deletions research_bench/eval_only.py
@@ -29,10 +29,10 @@ def compute_weighted_metric(metrics):
     weights = [0.2] * 5
     openai_metric = np.dot(weights, [np.mean(metrics[f'openai_sim_q{i}']) for i in range(1, 6)])
     voyageai_metric = np.dot(weights, [np.mean(metrics[f'voyageai_sim_q{i}']) for i in range(1, 6)])
+    bertscore_metric = np.dot(weights, [np.mean(metrics[f'bertscore_q{i}']) for i in range(1, 6)])
     bleu = np.dot(weights, [np.mean(metrics[f'bleu']) for i in range(1, 6)])
     rouge_l = np.dot(weights, [np.mean(metrics[f'rouge_l']) for i in range(1, 6)])
-    bert_score = np.dot(weights, [np.mean(metrics[f'bert_score']) for i in range(1, 6)])
-    return openai_metric, voyageai_metric, bleu, rouge_l, bert_score
+    return openai_metric, voyageai_metric, bertscore_metric, bleu, rouge_l

 def plot_sorted_metrics(metric1, metric2):
     sorted_indices = np.argsort(metric2)
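Because every weight is 0.2, each np.dot above is simply the unweighted average of the five per-question means. A small self-contained check with made-up numbers (illustration only, not benchmark data):

import numpy as np

# Dummy per-question similarity lists, three proposals per question.
metrics = {f'openai_sim_q{i}': [0.70, 0.80, 0.90] for i in range(1, 6)}
weights = [0.2] * 5
per_question_means = [np.mean(metrics[f'openai_sim_q{i}']) for i in range(1, 6)]
openai_metric = np.dot(weights, per_question_means)
print(openai_metric)                # 0.8
print(np.mean(per_question_means))  # identical: 0.8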
@@ -87,11 +87,10 @@ def plot_sorted_metrics(metric1, metric2):

 metrics_file1 = convert_aligned_to_metrics(aligned_metrics_file1)
 metrics_file2 = convert_aligned_to_metrics(aligned_metrics_file2)
-import pdb; pdb.set_trace()

 print("Computing weighted metrics...")
-metric1_openai, metric1_voyageai, metric1_bleu, metric1_rougel, metric1_bertscore = compute_weighted_metric(metrics_file1)
-metric2_openai, metric2_voyageai, metric2_bleu, metric2_rougel, metric2_bertscore = compute_weighted_metric(metrics_file2)
+metric1_openai, metric1_voyageai, metric1_bertscore, metric1_bleu, metric1_rougel = compute_weighted_metric(metrics_file1)
+metric2_openai, metric2_voyageai, metric2_bertscore, metric2_bleu, metric2_rougel = compute_weighted_metric(metrics_file2)

 print(f"File 1 - OpenAI metric: {metric1_openai}, VoyageAI metric: {metric1_voyageai}")
 print(f"File 2 - OpenAI metric: {metric2_openai}, VoyageAI metric: {metric2_voyageai}")
@@ -116,14 +115,4 @@ def plot_sorted_metrics(metric1, metric2):
print(f"average score for q{i} in file2: {np.mean([metrics_file2[f'openai_sim_q{i}'][j] for j in range(len(shared_ids))])}")
print(f"Paired t-test for q{i}: t-statistic = {t_stat}, p-value = {p_value}")


openai_avg_metric = np.dot([0.2] * 5, [np.mean(metrics_file1[f'openai_sim_q{i}']) for i in range(1, 6)])
voyageai_avg_metric = np.dot([0.2] * 5, [np.mean(metrics_file1[f'voyageai_sim_q{i}']) for i in range(1, 6)])
print(f"File 1 - OpenAI metric: {openai_avg_metric}, VoyageAI metric: {voyageai_avg_metric}")

openai_avg_metric = np.dot([0.2] * 5, [np.mean(metrics_file2[f'openai_sim_q{i}']) for i in range(1, 6)])
voyageai_avg_metric = np.dot([0.2] * 5, [np.mean(metrics_file2[f'voyageai_sim_q{i}']) for i in range(1, 6)])
print(f"File 2 - OpenAI metric: {openai_avg_metric}, VoyageAI metric: {voyageai_avg_metric}")


plot_sorted_metrics(metrics_file1['openai_sim_q5'], metrics_file2['openai_sim_q5'])
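The per-question comparison above prints a paired t-test that is computed earlier in the script, outside the visible hunk. A minimal self-contained sketch of that comparison, assuming scipy.stats.ttest_rel on similarity lists aligned over the same shared proposal ids; the numbers here are random placeholders:

import numpy as np
from scipy.stats import ttest_rel

# Random, illustration-only per-question similarities for two runs over the same 20 proposals.
rng = np.random.default_rng(0)
metrics_file1 = {f'openai_sim_q{i}': rng.uniform(0.6, 0.9, size=20) for i in range(1, 6)}
metrics_file2 = {f'openai_sim_q{i}': rng.uniform(0.6, 0.9, size=20) for i in range(1, 6)}

for i in range(1, 6):
    t_stat, p_value = ttest_rel(metrics_file1[f'openai_sim_q{i}'], metrics_file2[f'openai_sim_q{i}'])
    print(f"Paired t-test for q{i}: t-statistic = {t_stat:.3f}, p-value = {p_value:.4f}")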
4 changes: 2 additions & 2 deletions research_bench/run_eval.sh
@@ -1,15 +1,15 @@
 #!/bin/bash

 # Define the input and output paths, along with the modes to test
-INPUT_PATH="./paper_bench/paper_bench_easy_500.json"
+INPUT_PATH="./paper_bench/paper_bench_hard_500.json"
 OUTPUT_DIR="./results"
 MODES=("zero_shot")
 NUM_PROCESSES=4

 # Loop through each mode and run the evaluation
 for MODE in "${MODES[@]}"
 do
-    OUTPUT_PATH="${OUTPUT_DIR}/paper_bench_easy_500_result_4o_mini_${MODE}.jsonl"
+    OUTPUT_PATH="${OUTPUT_DIR}/paper_bench_hard_500_result_4o_mini_${MODE}.jsonl"
     echo "Running evaluation for mode: $MODE"
     poetry run python run_eval.py --input "$INPUT_PATH" --output "$OUTPUT_PATH" --mode "$MODE" --num_processes "$NUM_PROCESSES"
     echo "Finished evaluation for mode: $MODE"
