Update notebooks and scripts, clean up the output folder (#28)
* add experiment date to results.xlsx

* rename pickles, remove csvs

* generate results.xlsx with both Oct 2023 and Jan 2024 experiments

* create an archive folder for them

* copy the latest results

* improve the result uploading notebook: less hardcoding

* tmp: add final scores

* use the latest results from ai eval spreadsheet to calculate final scores

* add report tables in output folder

* Update litellm git reference after rebase of the upstream PR

---------

Co-authored-by: Motin <[email protected]>
semio and motin authored Feb 5, 2024
1 parent 51615a5 commit 4d462b0
Showing 57 changed files with 238 additions and 876 deletions.
6 changes: 3 additions & 3 deletions automation-api/poetry.lock


324 changes: 117 additions & 207 deletions automation-api/yival_experiments/notebooks/final_scores.py
@@ -19,242 +19,152 @@
from collections import Counter
import polars as pl
import pandas as pd
from lib.config import read_config
from lib.pilot.helpers import read_ai_eval_spreadsheet, get_questions, get_model_configs, get_prompt_variants

# load env
config = read_config()

# read the raw responses
output_df = pd.read_excel('../output/results.xlsx')
# load ai eval spreadsheet
ai_eval_sheet = read_ai_eval_spreadsheet()

results = ai_eval_sheet.evaluation_results.data.df.copy()

# function to check if the model answered correctly considering all responses.
# it's correct when the most common answer in all responses is correct.
def is_correct_p(round_results):
    c = Counter(round_results)
    top2 = c.most_common(2)
    if len(top2) == 1:
        if top2[0][0] == 3:
            return True
        else:
            return False
    else:
        if top2[0][1] != top2[1][1] and top2[0][0] == 3:
            return True
        else:
            return False


# use polars
results = pl.DataFrame(results)

results.columns

def correctness(lst):
    c = Counter(lst)
    top2 = c.most_common(2)

    if len(top2) > 1 and top2[0][1] == top2[1][1]:
        return 0

    return top2[0][0]


# rename the prompt_variation_id to match our report
results.select(pl.col(['prompt_variation_id']).unique())

prompt_id_mapping = {
    'instruct_question_options_1': 'prompt1',
    'instruct_question_options_2': 'prompt3',
    'no_option_letter': 'prompt2',
    'zh_no_option_letter': 'prompt2',
    'zh_instruct_2': 'prompt3',
    'zh_instruct_1': 'prompt1'
}
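# A minimal illustration of the majority-vote convention used by correctness()
# defined above (assuming this notebook's grade encoding, where 3 means a
# correct response): a clear majority wins, and a tie counts as indecisive (0).
assert correctness([3, 3, 1]) == 3   # majority of responses graded correct
assert correctness([3, 1]) == 0      # tie between grades -> indecisive
assert correctness([1, 1, 3]) == 1   # majority graded 1, so not correct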


# +
# output_df.columns
# for g, df in output_df.groupby(['question_id', 'model_id', 'model_params']):
# print(g)
# print(is_correct_p(df['correctness'].values))
# -


model_correctness = output_df.groupby(["question_id", "model_id", "model_params"])[
    "correctness"
].apply(lambda x: correctness(x.values))


# let's use polars. The syntax is easier than pandas
model_correctness = pl.DataFrame(model_correctness.reset_index())

# ## correct rate by model

# TODO: I think it's possible to convert these into a Yival Evaluator.
# 1. the correct rate for all answers
out1 = (
    model_correctness.group_by(["model_id", "model_params"])
    .agg(
        pl.col("correctness").filter(pl.col("correctness") == 3).count()
        / pl.col("correctness").count()
        * 100
    )
    .sort("correctness", descending=True)
).select(
    pl.col(['model_id', 'model_params']),
    pl.col('correctness').alias("correct_rate_with_indecisive")
)

out1

out1.write_csv('../output/correct_rate_with_indecisive.csv')
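# A small sketch (with made-up data, not real results) of the group_by/agg
# pattern used for out1-out3: count the rows matching a condition, divide by
# the group size, and scale to a percentage.
_demo = pl.DataFrame({
    "model_id": ["m1", "m1", "m1", "m2", "m2"],
    "correctness": [3, 0, 3, 1, 3],
})
_demo.group_by("model_id").agg(
    (pl.col("correctness").filter(pl.col("correctness") == 3).count()
     / pl.col("correctness").count() * 100).alias("correct_rate")
)  # m1 -> 66.7, m2 -> 50.0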

# 2. the correct rate when excluding all cases where correctness == 0
out2 = (
    model_correctness.filter(pl.col("correctness") != 0)
    .group_by(["model_id", "model_params"])
    .agg(
        pl.col("correctness").filter(pl.col("correctness") == 3).count()
        / pl.col("correctness").count()
        * 100
    )
    .sort("correctness", descending=True)
).select(
    pl.col(['model_id', 'model_params']),
    pl.col('correctness').alias("correct_rate_without_indecisive")
)

results = results.with_columns(
    pl.col('prompt_variation_id').replace(prompt_id_mapping)
)
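# Quick sanity sketch (hypothetical values) of how .replace() maps the raw
# prompt_variation_id values onto the report names in prompt_id_mapping;
# values missing from the mapping should pass through unchanged.
pl.DataFrame({"prompt_variation_id": ["no_option_letter", "zh_instruct_1"]}).with_columns(
    pl.col("prompt_variation_id").replace(prompt_id_mapping)
)  # -> "prompt2", "prompt1"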

out2

out2.write_csv('../output/correct_rate_without_indecisive.csv')

# 3. the response rate: (count correctness!=0) / (count total answers)
out3 = (
    model_correctness
    .group_by(["model_id", "model_params"])
    .agg(
        pl.col("correctness").filter(pl.col("correctness") != 0).count()
        / pl.col("correctness").count()
        * 100
    )
).select(
    pl.col(['model_id', 'model_params']),
    pl.col('correctness').alias('response_rate')
).sort("response_rate", descending=True)

out3

out3.write_csv('../output/response_rate.csv')

# double check
results['prompt_variation_id'].unique()

# create a mapping for model_id -> the actual brand, name and parameters
model_configs = get_model_configs(ai_eval_sheet, include_all=True)

# ## correct rates by prompts and model

model_correctness_prompt = output_df.groupby(
    ["question_id", "model_id", "model_params", "prompt_template"]
)["correctness"].apply(lambda x: correctness(x.values))

model_correctness_prompt = pl.DataFrame(model_correctness_prompt.reset_index())

model_correctness_prompt


def search_model(model_config_id):
    for model, model_config in model_configs:
        if model_config.model_config_id == model_config_id:
            return ' '.join([
                model.vendor, model.model_name, model_config.model_parameters])
    raise ValueError(f'{model_config_id} not found!')


model_config_ids = results['model_configuration_id'].unique().to_list()
model_config_names = [search_model(x) for x in model_config_ids]
model_config_id_mapping = dict(zip(model_config_ids, model_config_names))
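# Usage sketch for search_model(): it turns a configuration id into a readable
# "vendor model_name model_parameters" label. The id below is hypothetical, so
# the lookup falls through to the ValueError branch.
try:
    print(search_model("mc001"))
except ValueError as err:
    print(err)  # "mc001 not found!"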

out1 = (
    model_correctness_prompt.group_by(["model_id", "model_params", "prompt_template"])
    .agg(
        pl.col("correctness").filter(pl.col("correctness") == 3).count()
        / pl.col("correctness").count()
        * 100
    )
    .sort("correctness", descending=True)
).select(
    pl.col(['model_id', 'model_params', 'prompt_template']),
    pl.col('correctness').alias("correct_rate_with_indecisive")
)

out1

out1.write_csv('../output/correct_rate_with_indecisive_prompt.csv')

# replace nan with indecisive in result
results = results.with_columns(
    pl.col('result').replace({'nan': 'indecisive'})
)


out2 = (
    model_correctness_prompt.filter(pl.col("correctness") != 0)
    .group_by(["model_id", "model_params", "prompt_template"])
    .agg(
        pl.col("correctness").filter(pl.col("correctness") == 3).count()
        / pl.col("correctness").count()
        * 100
    )
    .sort("correctness", descending=True)
).select(
    pl.col(['model_id', 'model_params', 'prompt_template']),
    pl.col('correctness').alias("correct_rate_without_indecisive")
)

out2

out2.write_csv('../output/correct_rate_without_indecisive_prompt.csv')

# double check
results['model_configuration_id'].unique()


# Table 1. The number of different answers by model and prompt
table1 = results.with_columns(
    pl.concat_list(pl.col([
        'percent_correct',
        'percent_wrong',
        'percent_very_wrong',
        'percent_eval_failed'])).alias('tmp')
).with_columns(
    pl.col('tmp').map_elements(
        lambda x: len(list(filter(lambda e: e != 0, x)))
    ).alias('number_of_answers')
).select(
    pl.exclude('tmp')
).group_by(['model_configuration_id', 'prompt_variation_id']).agg(
    pl.col('number_of_answers').mean()
)
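# A minimal sketch (made-up percentages) of the concat_list + map_elements step
# used for table1: pack the four percent columns into a list per row and count
# how many of them are non-zero, i.e. how many different answers were given.
pl.DataFrame({
    "percent_correct": [100.0, 50.0],
    "percent_wrong": [0.0, 25.0],
    "percent_very_wrong": [0.0, 25.0],
    "percent_eval_failed": [0.0, 0.0],
}).with_columns(
    pl.concat_list(pl.col([
        "percent_correct", "percent_wrong",
        "percent_very_wrong", "percent_eval_failed"])).alias("tmp")
).with_columns(
    pl.col("tmp").map_elements(
        lambda x: len(list(filter(lambda e: e != 0, x)))
    ).alias("number_of_answers")
)  # number_of_answers: 1 and 3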

out3 = (
    model_correctness_prompt
    .group_by(["model_id", "model_params", 'prompt_template'])
    .agg(
        pl.col("correctness").filter(pl.col("correctness") != 0).count()
        / pl.col("correctness").count()
        * 100
    )
).select(
    pl.col(['model_id', 'model_params', 'prompt_template']),
    pl.col('correctness').alias('response_rate')
).sort("response_rate", descending=True)

out3

out3.write_csv('../output/response_rate_prompt.csv')

# ## correct rates break down by prompt.

prompt_templates = output_df['prompt_template'].unique()

prompt_templates.tolist()

# copy
prompt_templates_lst = [
    "Please answer this multiple choices question. If you can't determine the answer please make your best guess:\n\nQuestion:\n{question_text}\nA. {option_a}\nB. {option_b}\nC. {option_c}\n\nAnswer:",
    "Pick the correct answer: A, B or C. If you can't determine the answer please make your best guess.\n\nQuestion:\n{question_text}\nA. {option_a}\nB. {option_b}\nC. {option_c}\n\nAnswer:",
    'Question:\n{question_text}\nIs it: {option_a}; {option_b}; or {option_c}? \n\nAnswer:',
    '请回答以下单选题。如果你不知道哪个答案正确,请回答你认为最接近的答案。\n 问题:\n{question_text}\nA. {option_a}\nB. {option_b}\nC. {option_c}\n 答案:\n',
    '以下单选题,请你从 A,B,C 中选择一个正确的答案。如果你不知道哪个答案正确,请回答你认为最接近的答案。\n 问题:\n{question_text}\nA. {option_a}\nB. {option_b}\nC. {option_c}\n 答案:\n',
    '问题:\n{question_text}\n 是 {option_a},{option_b},还是 {option_c}?\n 答案:\n'
]

prompt_id_mappings = dict(
    zip(prompt_templates_lst, ['prompt1',
                               'prompt3',
                               'prompt2',
                               'prompt1',
                               'prompt3',
                               'prompt2'])
)

prompt_id_mappings

output_df['prompt_id'] = output_df['prompt_template'].map(lambda x: prompt_id_mappings[x])

table1 = table1.with_columns(
    pl.col('model_configuration_id').replace(model_config_id_mapping).alias('model_name')
)

prompt_correctness = output_df.groupby(
    ["question_id", "prompt_id"]
)["correctness"].apply(lambda x: correctness(x.values))


prompt_correctness = pl.DataFrame(prompt_correctness.reset_index())

prompt_correctness

out1 = prompt_correctness.group_by(['prompt_id']).agg(
    pl.col("correctness").filter(pl.col("correctness") == 3).count()
    / pl.col("correctness").count()
    * 100
).select(
    pl.col(['prompt_id']),
    pl.col('correctness').alias("correct_rate_with_indecisive")
)

out1

out1.write_csv('../output/correct_rate_with_indecisive_by_prompt.csv')

table1

table1.write_csv('../output/report_tables/1_number_of_average_answers.csv')

# Table 2. Correct / Wrong / Very Wrong / Indecisive Rates
table2 = results.group_by(
    ['model_configuration_id', 'prompt_variation_id']
).agg(
    pl.col('result').count().alias('total_questions_asked'),
    (pl.col('result').filter(pl.col('result') == 'correct').count()
     / pl.col('result').count()
     * 100).alias("Correct Rate %"),
    (pl.col('result').filter(pl.col('result') == 'wrong').count()
     / pl.col('result').count()
     * 100).alias("Wrong Rate %"),
    (pl.col('result').filter(pl.col('result') == 'very_wrong').count()
     / pl.col('result').count()
     * 100).alias("Very Wrong Rate %"),
    (pl.col('result').filter(pl.col('result').is_in(['indecisive', 'fail'])).count()
     / pl.col('result').count()
     * 100).alias("Indecisive Rate %")
)
# double check
table2.with_columns(
    (pl.col('Correct Rate %') +
     pl.col('Wrong Rate %') +
     pl.col('Very Wrong Rate %') +
     pl.col('Indecisive Rate %')).alias('total')
)['total'].min()  # should be about 100

table2 = table2.with_columns(
    pl.col('model_configuration_id').replace(model_config_id_mapping).alias('model_name')
)
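# Sketch of why the minimum above should be about 100 (toy data, not real
# results): after the 'nan' -> 'indecisive' replacement, every row's result
# falls into exactly one of the four buckets ('indecisive' and 'fail' are
# folded together), so the four percentages sum to 100 up to floating point.
_toy = pl.DataFrame({"result": ["correct", "wrong", "very_wrong", "fail", "indecisive"]})
_toy.select(
    (pl.col("result").filter(pl.col("result") == "correct").count()
     / pl.col("result").count() * 100)
    + (pl.col("result").filter(pl.col("result") == "wrong").count()
       / pl.col("result").count() * 100)
    + (pl.col("result").filter(pl.col("result") == "very_wrong").count()
       / pl.col("result").count() * 100)
    + (pl.col("result").filter(pl.col("result").is_in(["indecisive", "fail"])).count()
       / pl.col("result").count() * 100)
)  # 100.0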

out2 = (
    prompt_correctness
    .filter(pl.col("correctness") != 0)
    .group_by(['prompt_id'])
    .agg(
        pl.col("correctness").filter(pl.col("correctness") == 3).count()
        / pl.col("correctness").count()
        * 100
    )
).select(
    pl.col(['prompt_id']),
    pl.col('correctness').alias("correct_rate_without_indecisive")
)

table2

table2.write_csv('../output/report_tables/2_average_rates.csv')


# Table 3. correct rate by prompt
# Don't use the 20231104 results in this table, because prompt3
# was not tested in that experiment.
table3 = results.filter(
    ~pl.col('last_evaluation_datetime').is_in(['20231104'])
).group_by(
    ['prompt_variation_id']
).agg(
    pl.col('result').count().alias('total_questions_asked'),
    (pl.col('result').filter(pl.col('result') == 'correct').count()
     / pl.col('result').count()
     * 100).alias("Correct Rate %"),
    (pl.col('result').filter(pl.col('result') == 'wrong').count()
     / pl.col('result').count()
     * 100).alias("Wrong Rate %"),
    (pl.col('result').filter(pl.col('result') == 'very_wrong').count()
     / pl.col('result').count()
     * 100).alias("Very Wrong Rate %"),
    (pl.col('result').filter(pl.col('result').is_in(['indecisive', 'fail'])).count()
     / pl.col('result').count()
     * 100).alias("Indecisive Rate %")
)

out2

out2.write_csv('../output/correct_rate_without_indecisive_by_prompt.csv')

table3

table3.write_csv('../output/report_tables/3_correct_rate_by_prompt.csv')
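# Filter sketch (made-up datetime values): `~pl.col(...).is_in([...])` keeps
# only rows whose last_evaluation_datetime is NOT in the excluded list, which
# is how the 20231104 run is dropped from table3 above.
pl.DataFrame({"last_evaluation_datetime": ["20231104", "20240125"]}).filter(
    ~pl.col("last_evaluation_datetime").is_in(["20231104"])
)  # only the "20240125" row remains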