Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing issues with multichoice_continuations_start_space - was not parsed properly #232

Merged
merged 6 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions examples/model_configs/base_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ model:
adapter_weights: false # set to True if your model has been trained with peft, also need to provide the base model name
base_model: null # path to the base_model
generation:
multichoice_continuations_start_space: false # Whether to force multiple choice continuations to start with a space
no_multichoice_continuations_start_space: false # Whether to force multiple choice continuations to not start with a space
multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If none, will do nothing
1 change: 0 additions & 1 deletion examples/nanotron/lighteval_config_override_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ lighteval:
dataset_loading_processes: 8
max_samples: 10
multichoice_continuations_start_space: null
no_multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: early-signal
# tasks: custom|hellaswag|0
11 changes: 6 additions & 5 deletions src/lighteval/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,11 @@ def _check_continuations_start_space(self, continuation: str) -> str:
- None (Don't touch - default)
Todo: find a way to add this back WITHOUT breaking compatibility with the harness
"""
if self.multichoice_continuations_start_space is True and continuation[0] != " ":
continuation = " " + continuation
if self.multichoice_continuations_start_space is False and continuation[0] == " ":
continuation = continuation.lstrip()
if self.multichoice_continuations_start_space is not None:
if self.multichoice_continuations_start_space and continuation[0] != " ":
continuation = " " + continuation
if not self.multichoice_continuations_start_space and continuation[0] == " ":
continuation = continuation.lstrip()
return continuation

def _model_call(self, inputs: torch.Tensor) -> torch.Tensor:
Expand Down Expand Up @@ -949,7 +950,7 @@ def loglikelihood_single_token(
if any(len(c) > 1 for c in continuations_enc):
raise ValueError(
f"Trying to do single token multiple choice but one choice has several tokens: {continuations_enc}. "
"If the additional pre-token is a space, try to set --no_multichoice_continuations_start_space "
"If the additional pre-token is a space, try to set `multichoice_continuations_start_space=False` in the model parameters "
)
request.tokenized_continuation = continuations_enc

Expand Down
29 changes: 20 additions & 9 deletions src/lighteval/models/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
NO_AUTOGPTQ_ERROR_MSG,
NO_BNB_ERROR_MSG,
NO_PEFT_ERROR_MSG,
boolstring_to_bool,
is_accelerate_available,
is_autogptq_available,
is_bnb_available,
Expand Down Expand Up @@ -76,6 +77,7 @@ class BaseModelConfig:
space at the start of each continuation in multichoice generation.
For example, context: "What is the capital of France?" and choices: "Paris", "London".
Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London".
True adds a space, False strips a space, None does nothing
subfolder (Optional[str]): The subfolder within the model repository.
revision (str): The revision of the model.
batch_size (int): The batch size for model training.
Expand Down Expand Up @@ -127,6 +129,9 @@ class BaseModelConfig:
compile: bool = False

def __post_init__(self):
# Making sure this parameter is a boolean
self.multichoice_continuations_start_space = boolstring_to_bool(self.multichoice_continuations_start_space)
NathanHB marked this conversation as resolved.
Show resolved Hide resolved

if self.quantization_config is not None and not is_bnb_available():
raise ImportError(NO_BNB_ERROR_MSG)

Expand Down Expand Up @@ -350,15 +355,21 @@ def create_model_config( # noqa: C901
return InferenceModelConfig(model=config["base_params"]["endpoint_name"])

if config["type"] == "base":
# Tests on the multichoice space parameters
multichoice_continuations_start_space = config["generation"]["multichoice_continuations_start_space"]
no_multichoice_continuations_start_space = config["generation"]["no_multichoice_continuations_start_space"]
if not multichoice_continuations_start_space and not no_multichoice_continuations_start_space:
multichoice_continuations_start_space = None
if multichoice_continuations_start_space and no_multichoice_continuations_start_space:
raise ValueError(
"You cannot force both the multichoice continuations to start with a space and not to start with a space"
)
# Creating the multichoice space parameters
# We need to take into account possible conversion issues from our different input formats
multichoice_continuations_start_space = boolstring_to_bool(
config["generation"]["multichoice_continuations_start_space"]
)

if multichoice_continuations_start_space is not None:
if multichoice_continuations_start_space:
hlog(
"You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space"
)
else:
hlog(
"You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present."
)

# Creating optional quantization configuration
if config["base_params"]["dtype"] == "4bit":
Expand Down
29 changes: 10 additions & 19 deletions src/lighteval/models/nanotron_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
LoglikelihoodRequest,
LoglikelihoodRollingRequest,
)
from lighteval.utils import as_list
from lighteval.utils import as_list, boolstring_to_bool
from lighteval.utils_parallelism import find_executable_batch_size


Expand Down Expand Up @@ -113,20 +113,10 @@ def __init__(
# To implement PP parallelism we need to think about how we want to sync the output for the PP ranks without outputs
raise ValueError("PP parallelism is not supported yet")

multichoice_continuations_start_space = lighteval_config.tasks.multichoice_continuations_start_space
if (
not multichoice_continuations_start_space
and not lighteval_config.tasks.no_multichoice_continuations_start_space
):
multichoice_continuations_start_space = None
# multichoice_continuations_start_space can be True (forcing space), False (forcing no space) or None (no forcing)
# multichoice_continuations_start_space can be True (forcing space), False (forcing no space) or None (no forcing)
multichoice_continuations_start_space = boolstring_to_bool(
lighteval_config.tasks.multichoice_continuations_start_space
)

self.generation_config = lighteval_config.generation
if isinstance(self.generation_config, dict):
Expand Down Expand Up @@ -409,10 +399,11 @@ def _check_continuations_start_space(self, continuation: str) -> str:
- None (Don't touch - default)
Todo: find a way to add this back WITHOUT breaking compatibility with the harness
"""
if self.multichoice_continuations_start_space is True and continuation[0] != " ":
continuation = " " + continuation
if self.multichoice_continuations_start_space is False and continuation[0] == " ":
continuation = continuation.lstrip()
if self.multichoice_continuations_start_space is not None:
if self.multichoice_continuations_start_space and continuation[0] != " ":
continuation = " " + continuation
if not self.multichoice_continuations_start_space and continuation[0] == " ":
continuation = continuation.lstrip()
return continuation

def loglikelihood_single_token(
Expand Down Expand Up @@ -443,7 +434,7 @@ def loglikelihood_single_token(
if any(len(c) > 1 for c in continuations_enc):
raise ValueError(
f"Trying to do single token multiple choice but one choice has several tokens: {continuations_enc}. "
"If the additional pre-token is a space, try to set --no_multichoice_continuations_start_space "
"If the additional pre-token is a space, try to set multichoice_continuations_start_space=False in the model parameters "
)
request.tokenized_continuation = continuations_enc

Expand Down
18 changes: 18 additions & 0 deletions src/lighteval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,24 @@ def flatten(item: list[Union[list, str]]) -> list[str]:
return flat_item


def boolstring_to_bool(x: Union[str, bool, int, None]) -> Union[bool, None]:
    """Convert a loosely-formatted configuration value to a boolean (or None).

    Configuration inputs arrive from several sources (YAML files, CLI strings,
    already-parsed Python values), so this accepts strings, bools and ints.
    String matching is case-insensitive and ignores surrounding whitespace,
    e.g. "TRUE", " false ", "None", "null" and "" are all handled.

    Args:
        x (Union[str, bool, int, None]): The raw configuration value.

    Returns:
        Union[bool, None]: True/False for recognized boolean values,
        None for null-like values (None, "None", "null", "").

    Raises:
        ValueError: If ``x`` cannot be interpreted as a boolean or None.
    """
    # Normalize strings first: this also keeps the bool/int equality quirk
    # (True == 1, False == 0) from interfering with string comparisons.
    if isinstance(x, str):
        normalized = x.strip().lower()
        if normalized in ("none", "null", ""):
            return None
        if normalized == "true":
            return True
        if normalized == "false":
            return False
        raise ValueError(f"You tried to convert {x} to a boolean but it's not possible.")
    if x is None:
        return None
    if x in [True, 1]:
        return True
    if x in [False, 0]:
        return False
    raise ValueError(f"You tried to convert {x} to a boolean but it's not possible.")


def is_accelerate_available() -> bool:
return importlib.util.find_spec("accelerate") is not None

Expand Down
Loading