Skip to content

Commit

Permalink
update all code
Browse files Browse the repository at this point in the history
  • Loading branch information
lwaekfjlk committed Dec 2, 2024
1 parent dc27e18 commit 37bdf75
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 5 deletions.
9 changes: 8 additions & 1 deletion research_bench/eval_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,13 @@ def plot_sorted_metrics(metric1, metric2):
print(f"Paired t-test for q{i}: t-statistic = {t_stat}, p-value = {p_value}")


print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")
# For each results file, collapse the per-question mean similarities
# (q1..q5) into one number per metric family using equal weights of 0.2,
# then print the OpenAI and VoyageAI figures side by side.
equal_weights = [0.2] * 5
question_numbers = range(1, 6)

# --- File 1 ---
per_question_means = [
    np.mean(metrics_file1[f'openai_sim_q{q}']) for q in question_numbers
]
openai_avg_metric = np.dot(equal_weights, per_question_means)
per_question_means = [
    np.mean(metrics_file1[f'voyageai_sim_q{q}']) for q in question_numbers
]
voyageai_avg_metric = np.dot(equal_weights, per_question_means)
print(f"File 1 - OpenAI metric: {openai_avg_metric}, VoyageAI metric: {voyageai_avg_metric}")

# --- File 2 --- (reuses the same result names, as the File 1 values are
# already printed by this point)
per_question_means = [
    np.mean(metrics_file2[f'openai_sim_q{q}']) for q in question_numbers
]
openai_avg_metric = np.dot(equal_weights, per_question_means)
per_question_means = [
    np.mean(metrics_file2[f'voyageai_sim_q{q}']) for q in question_numbers
]
voyageai_avg_metric = np.dot(equal_weights, per_question_means)
print(f"File 2 - OpenAI metric: {openai_avg_metric}, VoyageAI metric: {voyageai_avg_metric}")


plot_sorted_metrics(metrics_file1['openai_sim_q5'], metrics_file2['openai_sim_q5'])
8 changes: 4 additions & 4 deletions research_bench/run_eval.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#!/bin/bash

# Define the input and output paths, along with the modes to test
INPUT_PATH="./paper_bench/paper_bench_mid_500.json"
INPUT_PATH="./paper_bench/paper_bench_hard_500.json"
OUTPUT_DIR="./results"
MODES=("fake_research_town")
NUM_PROCESSES=8
MODES=("citation_only")
NUM_PROCESSES=1

# Loop through each mode and run the evaluation
for MODE in "${MODES[@]}"
do
OUTPUT_PATH="${OUTPUT_DIR}/paper_bench_mid_500_result_4o_mini_${MODE}.jsonl"
OUTPUT_PATH="${OUTPUT_DIR}/paper_bench_hard_500_result_4o_mini_${MODE}.jsonl"
echo "Running evaluation for mode: $MODE"
poetry run python run_eval.py --input "$INPUT_PATH" --output "$OUTPUT_PATH" --mode "$MODE" --num_processes "$NUM_PROCESSES"
echo "Finished evaluation for mode: $MODE"
Expand Down
1 change: 1 addition & 0 deletions research_bench/split_paper_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def main():
paper_ids.remove(data['paper_id'])

filtered_dataset_sorted = sort_dataset_by_similarity(filtered_dataset)

bottom_500 = filtered_dataset_sorted[:500]
top_500 = filtered_dataset_sorted[-500:]
mid_500 = random.sample(filtered_dataset_sorted[500:-500], 500)
Expand Down

0 comments on commit 37bdf75

Please sign in to comment.