fail gracefully upon tokenizer logging failure (#2038)
haileyschoelkopf authored Jun 29, 2024
1 parent cc2d346 commit 2a6acc8
Showing 1 changed file with 23 additions and 9 deletions.
lm_eval/loggers/utils.py (23 additions, 9 deletions)
```diff
@@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]):
 
 def add_tokenizer_info(storage: Dict[str, Any], lm):
     if getattr(lm, "tokenizer", False):
-        tokenizer_info = {
-            "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
-            "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
-            "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
-            "eot_token_id": getattr(lm, "eot_token_id", None),
-            "max_length": getattr(lm, "max_length", None),
-        }
-        storage.update(tokenizer_info)
-    # seems gguf and textsynth do not have tokenizer
+        try:
+            tokenizer_info = {
+                "tokenizer_pad_token": [
+                    lm.tokenizer.pad_token,
+                    lm.tokenizer.pad_token_id,
+                ],
+                "tokenizer_eos_token": [
+                    lm.tokenizer.eos_token,
+                    lm.tokenizer.eos_token_id,
+                ],
+                "tokenizer_bos_token": [
+                    lm.tokenizer.bos_token,
+                    lm.tokenizer.bos_token_id,
+                ],
+                "eot_token_id": getattr(lm, "eot_token_id", None),
+                "max_length": getattr(lm, "max_length", None),
+            }
+            storage.update(tokenizer_info)
+        except Exception as err:
+            logger.debug(
+                f"Logging detailed tokenizer info failed with {err}, skipping..."
+            )
+    # seems gguf and textsynth do not have tokenizer
     else:
         logger.debug(
             "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
```
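For context, the failure mode this commit guards against can be reproduced with a stub LM whose tokenizer lacks one of the queried attributes. The following is a minimal sketch, not part of the repository: `FakeLM` and `PartialTokenizer` are hypothetical stand-ins, and it assumes lm-evaluation-harness is installed so that `lm_eval.loggers.utils.add_tokenizer_info` is importable.

```python
# Minimal sketch: FakeLM / PartialTokenizer are hypothetical stand-ins,
# not part of lm-evaluation-harness.
import logging
from typing import Any, Dict

from lm_eval.loggers.utils import add_tokenizer_info

logging.basicConfig(level=logging.DEBUG)


class PartialTokenizer:
    """Tokenizer stub with eos/bos tokens but no pad_token attributes."""

    eos_token, eos_token_id = "</s>", 2
    bos_token, bos_token_id = "<s>", 1
    # No pad_token / pad_token_id: accessing them raises AttributeError,
    # which is exactly the failure the new try/except catches.


class FakeLM:
    """LM stub that exposes a tokenizer attribute."""

    tokenizer = PartialTokenizer()


storage: Dict[str, Any] = {}
add_tokenizer_info(storage, FakeLM())  # logs a DEBUG message instead of raising
print(storage)  # {} : tokenizer metadata skipped, the run continues
```

Before this change, the `AttributeError` from the missing `pad_token` would propagate out of `add_tokenizer_info` and abort results logging; with the `try`/`except` it is demoted to a debug message and the evaluation run proceeds.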
