Commit

Fixing issues with multichoice_continuations_start_space - was not parsed properly (#232)

* fixing issues - was not parsed properly

* Update src/lighteval/models/model_config.py

Co-authored-by: Nathan Habib <[email protected]>

* fix

---------

Co-authored-by: Nathan Habib <[email protected]>
clefourrier and NathanHB authored Aug 1, 2024
1 parent cbae17d commit 6041e36
Showing 6 changed files with 55 additions and 36 deletions.
3 changes: 1 addition & 2 deletions examples/model_configs/base_model.yaml
@@ -9,5 +9,4 @@ model:
     adapter_weights: false # set to True of your model has been trained with peft, also need to provide the base model name
     base_model: null # path to the base_model
   generation:
-    multichoice_continuations_start_space: false # Whether to force multiple choice continuations to start with a space
-    no_multichoice_continuations_start_space: false # Whether to force multiple choice continuations to not start with a space
+    multichoice_continuations_start_space: null # If true/false, will force multiple choice continuations to start/not start with a space. If none, will do nothing
1 change: 0 additions & 1 deletion examples/nanotron/lighteval_config_override_template.yaml
@@ -24,7 +24,6 @@ lighteval:
     dataset_loading_processes: 8
     max_samples: 10
     multichoice_continuations_start_space: null
-    no_multichoice_continuations_start_space: null
     num_fewshot_seeds: null
     tasks: early-signal
     # tasks: custom|hellaswag|0
11 changes: 6 additions & 5 deletions src/lighteval/models/base_model.py
@@ -296,10 +296,11 @@ def _check_continuations_start_space(self, continuation: str) -> str:
         - None (Don't touch - default)
         Todo: find a way to add this back WITHOUT breaking compatibility with the harness
         """
-        if self.multichoice_continuations_start_space is True and continuation[0] != " ":
-            continuation = " " + continuation
-        if self.multichoice_continuations_start_space is False and continuation[0] == " ":
-            continuation = continuation.lstrip()
+        if self.multichoice_continuations_start_space is not None:
+            if self.multichoice_continuations_start_space and continuation[0] != " ":
+                continuation = " " + continuation
+            if not self.multichoice_continuations_start_space and continuation[0] == " ":
+                continuation = continuation.lstrip()
         return continuation

     def _model_call(self, inputs: torch.Tensor) -> torch.Tensor:

@@ -949,7 +950,7 @@ def loglikelihood_single_token(
             if any(len(c) > 1 for c in continuations_enc):
                 raise ValueError(
                     f"Trying to do single token multiple choice but one choice has several tokens: {continuations_enc}. "
-                    "If the additional pre-token is a space, try to set --no_multichoice_continuations_start_space "
+                    "If the additional pre-token is a space, try to set `multichoice_continuations_start_space=False` in the model parameters "
                 )
             request.tokenized_continuation = continuations_enc
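
The rewritten _check_continuations_start_space (identical in base_model.py and nanotron_model.py below) implements a tri-state switch. A minimal standalone sketch of the same logic, using a hypothetical normalize_continuation helper that is not part of lighteval:

    from typing import Optional

    def normalize_continuation(continuation: str, start_space: Optional[bool]) -> str:
        # None leaves the continuation untouched; True forces a leading
        # space; False strips any leading whitespace.
        if start_space is not None:
            if start_space and continuation[0] != " ":
                continuation = " " + continuation
            if not start_space and continuation[0] == " ":
                continuation = continuation.lstrip()
        return continuation

    print(normalize_continuation("Paris", True))    # ' Paris'
    print(normalize_continuation(" Paris", False))  # 'Paris'
    print(normalize_continuation(" Paris", None))   # ' Paris' (unchanged)
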
29 changes: 20 additions & 9 deletions src/lighteval/models/model_config.py
@@ -34,6 +34,7 @@
     NO_AUTOGPTQ_ERROR_MSG,
     NO_BNB_ERROR_MSG,
     NO_PEFT_ERROR_MSG,
+    boolstring_to_bool,
     is_accelerate_available,
     is_autogptq_available,
     is_bnb_available,

@@ -76,6 +77,7 @@ class BaseModelConfig:
             space at the start of each continuation in multichoice generation.
             For example, context: "What is the capital of France?" and choices: "Paris", "London".
             Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London".
+            True adds a space, False strips a space, None does nothing
         subfolder (Optional[str]): The subfolder within the model repository.
         revision (str): The revision of the model.
         batch_size (int): The batch size for model training.

@@ -127,6 +129,9 @@ class BaseModelConfig:
     compile: bool = False

     def __post_init__(self):
+        # Making sure this parameter is a boolean
+        self.multichoice_continuations_start_space = boolstring_to_bool(self.multichoice_continuations_start_space)
+
         if self.quantization_config is not None and not is_bnb_available():
             raise ImportError(NO_BNB_ERROR_MSG)

@@ -350,15 +355,21 @@ def create_model_config( # noqa: C901
         return InferenceModelConfig(model=config["base_params"]["endpoint_name"])

     if config["type"] == "base":
-        # Tests on the multichoice space parameters
-        multichoice_continuations_start_space = config["generation"]["multichoice_continuations_start_space"]
-        no_multichoice_continuations_start_space = config["generation"]["no_multichoice_continuations_start_space"]
-        if not multichoice_continuations_start_space and not no_multichoice_continuations_start_space:
-            multichoice_continuations_start_space = None
-        if multichoice_continuations_start_space and no_multichoice_continuations_start_space:
-            raise ValueError(
-                "You cannot force both the multichoice continuations to start with a space and not to start with a space"
-            )
+        # Creating the multichoice space parameters
+        # We need to take into account possible conversion issues from our different input formats
+        multichoice_continuations_start_space = boolstring_to_bool(
+            config["generation"]["multichoice_continuations_start_space"]
+        )
+
+        if multichoice_continuations_start_space is not None:
+            if multichoice_continuations_start_space:
+                hlog(
+                    "You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space"
+                )
+            else:
+                hlog(
+                    "You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present."
+                )

         # Creating optional quantization configuration
         if config["base_params"]["dtype"] == "4bit":
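
The practical effect of routing the value through boolstring_to_bool is that inputs arriving as strings are normalized to real booleans before any truthiness checks. A short sketch of one plausible failure mode the commit message hints at (the config dict below is illustrative, not taken from the repo):

    # Any non-empty string is truthy in Python, so a "false" arriving as a
    # string would previously have been treated as True.
    raw = "false"
    print(bool(raw))  # True -- the pitfall

    # The new path normalizes the value first; boolstring_to_bool is the
    # helper this commit adds to src/lighteval/utils.py.
    from lighteval.utils import boolstring_to_bool

    config = {"generation": {"multichoice_continuations_start_space": "false"}}
    value = boolstring_to_bool(config["generation"]["multichoice_continuations_start_space"])
    print(value, type(value))  # False <class 'bool'>
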
29 changes: 10 additions & 19 deletions src/lighteval/models/nanotron_model.py
@@ -63,7 +63,7 @@
     LoglikelihoodRequest,
     LoglikelihoodRollingRequest,
 )
-from lighteval.utils import as_list
+from lighteval.utils import as_list, boolstring_to_bool
 from lighteval.utils_parallelism import find_executable_batch_size


@@ -113,20 +113,10 @@ def __init__(
             # To implement PP parallelism we need to think about how we want to sync the output for the PP ranks without outputs
             raise ValueError("PP parallelism is not supported yet")

-        multichoice_continuations_start_space = lighteval_config.tasks.multichoice_continuations_start_space
-        if (
-            not multichoice_continuations_start_space
-            and not lighteval_config.tasks.no_multichoice_continuations_start_space
-        ):
-            multichoice_continuations_start_space = None
         # multichoice_continuations_start_space can be True (forcing space), False (forcing no space) or None (no forcing)
-        if (
-            multichoice_continuations_start_space
-            and lighteval_config.tasks.no_multichoice_continuations_start_space
-        ):
-            raise ValueError(
-                "You cannot force both the multichoice continuations to start with a space and not to start with a space"
-            )
+        multichoice_continuations_start_space = boolstring_to_bool(
+            lighteval_config.tasks.multichoice_continuations_start_space
+        )

         self.generation_config = lighteval_config.generation
         if isinstance(self.generation_config, dict):

@@ -409,10 +399,11 @@ def _check_continuations_start_space(self, continuation: str) -> str:
         - None (Don't touch - default)
         Todo: find a way to add this back WITHOUT breaking compatibility with the harness
         """
-        if self.multichoice_continuations_start_space is True and continuation[0] != " ":
-            continuation = " " + continuation
-        if self.multichoice_continuations_start_space is False and continuation[0] == " ":
-            continuation = continuation.lstrip()
+        if self.multichoice_continuations_start_space is not None:
+            if self.multichoice_continuations_start_space and continuation[0] != " ":
+                continuation = " " + continuation
+            if not self.multichoice_continuations_start_space and continuation[0] == " ":
+                continuation = continuation.lstrip()
         return continuation

     def loglikelihood_single_token(

@@ -443,7 +434,7 @@ def loglikelihood_single_token(
             if any(len(c) > 1 for c in continuations_enc):
                 raise ValueError(
                     f"Trying to do single token multiple choice but one choice has several tokens: {continuations_enc}. "
-                    "If the additional pre-token is a space, try to set --no_multichoice_continuations_start_space "
+                    "If the additional pre-token is a space, try to set multichoice_continuations_start_space=False in the model parameters "
                 )
             request.tokenized_continuation = continuations_enc
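
The error message above exists because single-token multiple choice scoring requires each choice to encode to exactly one token, and a leading space changes how a BPE tokenizer splits the text. A quick way to see this with the GPT-2 tokenizer (assuming transformers is installed; any BPE-style tokenizer shows the same effect):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # The two encodings generally differ, and for some words the spaced
    # variant splits into more tokens than the bare one (or vice versa).
    print(tokenizer.encode("Paris", add_special_tokens=False))
    print(tokenizer.encode(" Paris", add_special_tokens=False))

If a choice ends up with several tokens only because of the leading space, setting multichoice_continuations_start_space=False strips it before encoding.
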
18 changes: 18 additions & 0 deletions src/lighteval/utils.py
@@ -145,6 +145,24 @@ def flatten(item: list[Union[list, str]]) -> list[str]:
     return flat_item


+def boolstring_to_bool(x: Union[str, bool, int]) -> Union[bool, None]:
+    """Allows to manage string or bool to bool conversion, in case a configuration input is badly formatted.
+
+    Args:
+        x (str): A string (true, false, True, False, ...)
+
+    Returns:
+        Union[bool, None]: the corresponding boolean
+    """
+    if x in [None, "None", "null", ""]:
+        return None
+    if x in ["True", "true", True, 1]:
+        return True
+    if x in ["False", "false", False, 0]:
+        return False
+    raise ValueError(f"You tried to convert {x} to a boolean but it's not possible.")
+
+
 def is_accelerate_available() -> bool:
     return importlib.util.find_spec("accelerate") is not None
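
A few calls showing the accepted input formats and the error path (assuming lighteval at this commit is importable):

    from lighteval.utils import boolstring_to_bool

    print(boolstring_to_bool("true"))   # True
    print(boolstring_to_bool("False"))  # False
    print(boolstring_to_bool(1))        # True
    print(boolstring_to_bool("null"))   # None
    print(boolstring_to_bool(None))     # None

    try:
        boolstring_to_bool("yes")       # unsupported spelling
    except ValueError as e:
        print(e)  # You tried to convert yes to a boolean but it's not possible.
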
