add system for lighteval format
clefourrier committed Jan 31, 2024
1 parent 3c732d0 commit a1ddda4
Showing 3 changed files with 70 additions and 39 deletions.
46 changes: 34 additions & 12 deletions src/lighteval/tasks/tasks_prompt_formatting.py
@@ -1,6 +1,7 @@
 import ast
 import json
 import random
+import numpy as np
 import re
 import string

@@ -115,22 +116,43 @@ def process_path(path: str) -> str:
             story.append(text)
     return queries

+def bbh_harness(line, task_name: str = None):
+    line = {k: v for k, v in line.items() if v is not None}
+
+    task_prefix = line.get("task_prefix", "")
+    example_input_prefix = line.get("example_input_prefix", "\nQ: ")
+    query = f"{task_prefix}{example_input_prefix}{line['input']}"
+
+    rng = np.random.RandomState(seed=42)
+    choice_prefix = line.get("choice_prefix", "\n choice: ")
+    append_choices = bool(line.get("append_choices_to_input", True))
+    if append_choices:
+        permuted_choices = list(rng.permutation(sorted(line["choices"])))
+        query = f"{query}{choice_prefix}{choice_prefix.join(permuted_choices)}"
+
+    example_output_prefix = line.get("example_output_prefix", "\nA: ")
+    query = f"{query}{example_output_prefix}"
+
+    gold_item = line["choices"][line["target_idx"]]
+    correct_index = permuted_choices.index(gold_item)
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=permuted_choices,
+        gold_index=correct_index,
+        target_for_fewshot_sorting=permuted_choices,
+        instruction=line.get("task_prefix", None),
+    )

 def bbh_lighteval(line, task_name: str = None):
-    query = "", ""
-    if "task_prefix" in line:
-        query = line["task_prefix"]
-    if "example_input_prefix" in line:
-        query += line["example_input_prefix"]
-    else:
-        query += "Q: "
+    line = {k: v for k, v in line.items() if v is not None}
+
+    query = line.get("task_prefix", "")
+    query += line.get("example_input_prefix", "\nQuestion: ")
     query += line["input"]
-    if "example_output_prefix" in line:
-        query += line["example_output_prefix"]
-    else:
-        query += "Choices:"
+    query += line.get("choice_prefix", "\n Choices: ")
     query += "".join([f"\n{key}. {choice}" for key, choice in zip(LETTER_INDICES, line["choices"])])
-    query += "\nAnswer: "
+    query += line.get("example_output_prefix", "\nAnswer: ")

     return Doc(
         task_name=task_name,
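For reference, here is a minimal standalone sketch of what the new bbh_harness formatting produces for a hypothetical BBH-style row. The example row and the plain-dict result are illustrative only (they are not part of the commit); the field names mirror the keys read by bbh_harness above, and the Doc container is replaced by a dict so the snippet runs on its own.

import numpy as np

# Hypothetical row; real rows come from the lighteval/bbh dataset.
line = {
    "input": "Is the following sentence plausible? \"The sky is green.\"",
    "choices": ["plausible", "implausible"],
    "target_idx": 1,
}

# Same construction as bbh_harness, with Doc replaced by a plain dict.
query = line.get("task_prefix", "") + line.get("example_input_prefix", "\nQ: ") + line["input"]

rng = np.random.RandomState(seed=42)
choice_prefix = line.get("choice_prefix", "\n choice: ")
permuted_choices = list(rng.permutation(sorted(line["choices"])))
query += choice_prefix + choice_prefix.join(permuted_choices)
query += line.get("example_output_prefix", "\nA: ")

gold_index = permuted_choices.index(line["choices"][line["target_idx"]])
doc = {"query": query, "choices": permuted_choices, "gold_index": gold_index}
print(doc["query"])
print(doc["choices"], "-> gold index:", doc["gold_index"])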
18 changes: 18 additions & 0 deletions src/lighteval/tasks/tasks_table.jsonl
@@ -86,6 +86,24 @@
{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:causal_judgment","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:date_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:geometric_shapes","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:movie_recommendation","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:navigate","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:ruin_names","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:snarks","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:sports_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:temporal_sequences","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:auto_debugging","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"auto_debugging","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:bbq_lite_json:age_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
{"name":"bigbench:bbq_lite_json:age_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
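Each line of tasks_table.jsonl is a self-contained task configuration. Below is a small sketch of how the new harness-suite BBH entries could be inspected with generic JSON-lines parsing; this is not lighteval's own registry loader, and the path is assumed to be relative to the repository root.

import json

with open("src/lighteval/tasks/tasks_table.jsonl") as f:
    tasks = [json.loads(row) for row in f if row.strip()]

# Keep only the entries added here, i.e. those using the bbh_harness prompt function.
bbh_harness_tasks = [t for t in tasks if t.get("prompt_function") == "bbh_harness"]
for t in bbh_harness_tasks:
    print(t["name"], t["suite"], t["metric"])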
45 changes: 18 additions & 27 deletions tasks_examples/bbh.txt
@@ -1,30 +1,3 @@
-harness|bbh:boolean_expressions|0|0
-harness|bbh:causal_judgment|0|0
-harness|bbh:date_understanding|0|0
-harness|bbh:disambiguation_qa|0|0
-harness|bbh:dyck_languages|0|0
-harness|bbh:formal_fallacies|0|0
-harness|bbh:geometric_shapes|0|0
-harness|bbh:hyperbaton|0|0
-harness|bbh:logical_deduction_five_objects|0|0
-harness|bbh:logical_deduction_seven_objects|0|0
-harness|bbh:logical_deduction_three_objects|0|0
-harness|bbh:movie_recommendation|0|0
-harness|bbh:multistep_arithmetic_two|0|0
-harness|bbh:navigate|0|0
-harness|bbh:object_counting|0|0
-harness|bbh:penguins_in_a_table|0|0
-harness|bbh:reasoning_about_colored_objects|0|0
-harness|bbh:ruin_names|0|0
-harness|bbh:salient_translation_error_detection|0|0
-harness|bbh:snarks|0|0
-harness|bbh:sports_understanding|0|0
-harness|bbh:temporal_sequences|0|0
-harness|bbh:tracking_shuffled_objects_five_objects|0|0
-harness|bbh:tracking_shuffled_objects_seven_objects|0|0
-harness|bbh:tracking_shuffled_objects_three_objects|0|0
-harness|bbh:web_of_lies|0|0
-harness|bbh:word_sorting|0|0
lighteval|bigbench:causal_judgment|0|0
lighteval|bigbench:date_understanding|0|0
lighteval|bigbench:disambiguation_qa|0|0
@@ -43,3 +16,21 @@ lighteval|bigbench:temporal_sequences|0|0
lighteval|bigbench:tracking_shuffled_objects_five_objects|0|0
lighteval|bigbench:tracking_shuffled_objects_seven_objects|0|0
lighteval|bigbench:tracking_shuffled_objects_three_objects|0|0
+harness|bigbench:causal_judgment|0|0
+harness|bigbench:date_understanding|0|0
+harness|bigbench:disambiguation_qa|0|0
+harness|bigbench:geometric_shapes|0|0
+harness|bigbench:logical_deduction_five_objects|0|0
+harness|bigbench:logical_deduction_seven_objects|0|0
+harness|bigbench:logical_deduction_three_objects|0|0
+harness|bigbench:movie_recommendation|0|0
+harness|bigbench:navigate|0|0
+harness|bigbench:reasoning_about_colored_objects|0|0
+harness|bigbench:ruin_names|0|0
+harness|bigbench:salient_translation_error_detection|0|0
+harness|bigbench:snarks|0|0
+harness|bigbench:sports_understanding|0|0
+harness|bigbench:temporal_sequences|0|0
+harness|bigbench:tracking_shuffled_objects_five_objects|0|0
+harness|bigbench:tracking_shuffled_objects_seven_objects|0|0
+harness|bigbench:tracking_shuffled_objects_three_objects|0|0
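Each entry in tasks_examples/bbh.txt is a pipe-separated task spec; the two trailing numbers appear to be the few-shot count and a truncate-few-shots flag (an assumption based on the file layout, not stated in this commit). A minimal parsing sketch:

# Hedged sketch: split a task-spec line into its fields. The meaning of the last
# two fields (few-shot count, truncation flag) is assumed, not confirmed here.
spec = "harness|bigbench:causal_judgment|0|0"
suite, task, num_fewshot, truncate = spec.split("|")
print(suite, task, int(num_fewshot), bool(int(truncate)))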
