Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add tokenizer logs info #1731

Merged
merged 11 commits into from
Jun 24, 2024
Prev Previous commit
Next Next commit
add no tokenizer case
artemorloff committed Apr 22, 2024
commit dcc779d4d4a76c34363616f4a18a9b18fc691f1b
24 changes: 17 additions & 7 deletions lm_eval/logging_utils.py
Original file line number Diff line number Diff line change
@@ -456,11 +456,21 @@ def add_env_info(storage: Dict[str, Any]):


def add_tokenizer_info(storage: Dict[str, Any], lm):
    """Record tokenizer metadata from ``lm`` into ``storage`` (in place).

    Stores the pad/eos/bos tokens with their ids, the end-of-text token id,
    and the model's maximum context length. Backends without a tokenizer
    (e.g. gguf and textsynth — per the original author's note) get empty
    lists / ``None`` placeholders so downstream consumers see a uniform
    schema.

    Args:
        storage: dict updated in place with the tokenizer info keys.
        lm: language-model wrapper; may or may not expose a ``tokenizer``
            attribute (assumed HuggingFace-style when present — the code
            reads ``pad_token``/``eos_token``/``bos_token`` and their ids).
    """
    # Use an explicit None check: getattr(..., False) would misclassify a
    # present-but-falsy tokenizer object as "no tokenizer".
    tokenizer = getattr(lm, "tokenizer", None)
    if tokenizer is not None:
        tokenizer_info = {
            "tokenizer_pad_token": [tokenizer.pad_token, tokenizer.pad_token_id],
            "tokenizer_eos_token": [tokenizer.eos_token, tokenizer.eos_token_id],
            "tokenizer_bos_token": [tokenizer.bos_token, tokenizer.bos_token_id],
            "eot_token_id": getattr(lm, "eot_token_id", None),
            "max_length": getattr(lm, "max_length", None),
        }
    else:
        # seems gguf and textsynth do not have tokenizer
        tokenizer_info = {
            "tokenizer_pad_token": [],
            "tokenizer_eos_token": [],
            "tokenizer_bos_token": [],
            "eot_token_id": None,
            "max_length": None,
        }
    storage.update(tokenizer_info)