Skip to content

Commit

Permalink
Add support for --multiple_choice_generate abcd
Browse files Browse the repository at this point in the history
  • Loading branch information
pasky committed Dec 29, 2024
1 parent 5cca68f commit d9e49af
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 14 deletions.
2 changes: 2 additions & 0 deletions docs/interface.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ This mode supports a number of command-line arguments, the details of which can

- `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on.

- `--multiple_choice_generate` : If set, multiple choice problems are not evaluated based on the lowest-logprob continuation, but by asking the model to generate the answer. This departs from the traditional evaluation methodology, but allows evaluation with popular chat-completion APIs and evaluates each multiple choice problem only once rather than once per choice. Without an additional argument, choices must be reproduced verbatim by the model; with the additional argument 'abcd' (RECOMMENDED), choices will be lettered and the model has to produce only the corresponding letter.

- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.

* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42.
Expand Down
8 changes: 6 additions & 2 deletions lm_eval/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,17 @@ def setup_parser() -> argparse.ArgumentParser:
)
parser.add_argument(
"--multiple_choice_generate",
action="store_true",
type=str,
nargs="?",
const=True,
default=False,
help=(
"If True, multiple choice problems are not evaluated based on lowest logprob continuation, "
"but asking the model to generate the choice letter. This departs from the traditional evaluation "
"methodology, but allows evaluation with popular chat-completion APIs and evaluates each multiple choice "
"problem only once rather than #choice times."
"problem only once rather than #choice times. Without additional argument, choices must be reproduced "
"verbatim by the model; with additional argument 'abcd', choices will be lettered and the model has to "
"produce only the corresponding letter."
),
)
parser.add_argument(
Expand Down
33 changes: 25 additions & 8 deletions lm_eval/api/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class TaskConfig(dict):
description: str = ""
target_delimiter: str = " "
choice_delimiter: str = " / "
option_delimiter: str = "\n"
fewshot_delimiter: str = "\n\n"
fewshot_config: Optional[dict] = None
# runtime configuration options
Expand Down Expand Up @@ -380,7 +381,7 @@ def build_all_requests(
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
multiple_choice_generate: bool = False,
multiple_choice_generate: Union[bool, str] = False,
chat_template: Optional[Callable] = None,
tokenizer_name: str = "",
) -> None:
Expand Down Expand Up @@ -438,10 +439,13 @@ def build_all_requests(
):
# sample fewshot context #TODO: need to offset doc_id by rank now!
doc_system_instruction = system_instruction or ""
if multiple_choice_generate:
if self.OUTPUT_TYPE == "multiple_choice" and multiple_choice_generate:
if doc_system_instruction:
doc_system_instruction += " "
doc_system_instruction += "Please answer with the letter of the correct answer."
if multiple_choice_generate == "abcd":
doc_system_instruction += "Please include \"ANSWER: <letter>\" in your response with the letter of the correct last answer."
else:
doc_system_instruction += "Please answer with the letter of the correct last answer."

fewshot_ctx = self.fewshot_context(
doc,
Expand Down Expand Up @@ -1034,7 +1038,7 @@ def fewshot_context(
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
multiple_choice_generate: bool = False,
multiple_choice_generate: Union[bool, str] = False,
chat_template: Optional[Callable] = None,
) -> str:
"""Returns a fewshot context string that is made up of a prepended description
Expand All @@ -1050,7 +1054,7 @@ def fewshot_context(
Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param multiple_choice_generate: bool
:param multiple_choice_generate: Union[bool, str]
Whether to generate multiple choice answer from scratch rather than pick by logprobs.
:param chat_template:
callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
Expand Down Expand Up @@ -1101,8 +1105,13 @@ def fewshot_context(
if self.config.doc_to_choice is not None and multiple_choice_generate:
if not isinstance(example, str):
raise NotImplementedError("--multiple_choice_generate is implemented only for simple text docs")
example += self.config.target_delimiter
example += "(" + self.config.choice_delimiter.join(self.doc_to_choice(doc)) + ")"
if multiple_choice_generate == "abcd":
choices = self.doc_to_choice(doc)
for label, choice in zip(list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")[:len(choices)], choices):
example += f"{self.config.option_delimiter}({label}) {choice}"
else:
example += self.config.target_delimiter
example += "(" + self.config.choice_delimiter.join(self.doc_to_choice(doc)) + ")"

if apply_chat_template:
if self.multiple_input:
Expand Down Expand Up @@ -1319,7 +1328,7 @@ def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
return None

def construct_requests(
self, doc: dict, ctx: str, multiple_choice_generate: bool, **kwargs
self, doc: dict, ctx: str, multiple_choice_generate: Union[bool, str], **kwargs
) -> Union[List[Instance], Instance]:
apply_chat_template = kwargs.pop("apply_chat_template", False)

Expand Down Expand Up @@ -1526,6 +1535,14 @@ def process_results(self, doc, results):
# it assumes that doc_to_target returns a number.
choices = self.doc_to_choice(doc)
gold = choices[gold]
if self.multiple_choice_generate == "abcd":
try:
result_label = re.findall(r"ANSWER: ([A-Z])", result)[-1]
result_i = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ").index(result_label)
result = choices[result_i]
except (AttributeError, ValueError, IndexError):
eval_logger.warning(f"[{self}] LLM did not pick a valid result ('{result}')")
result = choices[0] # XXX guess "randomly"
# we expect multiple_targets to be a list.
elif self.multiple_target:
gold = list(gold)
Expand Down
8 changes: 4 additions & 4 deletions lm_eval/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def simple_evaluate(
system_instruction: Optional[str] = None,
apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False,
multiple_choice_generate: bool = False,
multiple_choice_generate: Union[bool, str] = False,
gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None,
verbosity: str = "INFO",
Expand Down Expand Up @@ -120,7 +120,7 @@ def simple_evaluate(
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param multiple_choice_generate: bool
:param multiple_choice_generate: Union[bool, str]
Whether to generate multiple choice answer from scratch rather than pick by logprobs.
:param gen_kwargs: str
String arguments for model generation
Expand Down Expand Up @@ -376,7 +376,7 @@ def evaluate(
system_instruction: Optional[str] = None,
apply_chat_template: Union[bool, str] = False,
fewshot_as_multiturn: bool = False,
multiple_choice_generate: bool = False,
multiple_choice_generate: Union[bool, str] = False,
verbosity: str = "INFO",
):
"""Instantiate and evaluate a model on a list of tasks.
Expand All @@ -402,7 +402,7 @@ def evaluate(
Defaults to False (no chat template applied).
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param multiple_choice_generate: bool
:param multiple_choice_generate: Union[bool, str]
Whether to generate multiple choice answer from scratch rather than pick by logprobs.
:return
Dictionary of results
Expand Down

0 comments on commit d9e49af

Please sign in to comment.