
Update transformers requirement from !=4.34.0,<4.52,>=4.11 to >=4.11,!=4.34.0,<4.53 #3862

Merged
6 changes: 6 additions & 0 deletions composer/models/huggingface.py
@@ -287,6 +287,9 @@ def load_huggingface_tokenizer_from_saved_state(
     s.load_from_serialized_proto(saved_content['content'])  # pyright: ignore[reportGeneralTypeIssues]
     with open(tokenizer_file_path, 'wb') as _f:
         _f.write(s.serialized_model_proto())
+elif saved_content['file_extension'] == '.jinja':
+    with open(tokenizer_file_path, 'w', encoding='utf-8') as _f:
+        _f.write(saved_content['content'])
 
 hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
     tokenizer_save_dir,
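For reference, the entry that the new '.jinja' branch consumes looks like the following. A minimal sketch: the key names ('file_extension', 'content') match the assertions in the test added at the bottom of this PR, while the template literal itself is only an illustration.

    # Sketch of one entry in the serialized tokenizer state; key names match
    # the new test below, the template string is illustrative.
    saved_content = {
        'file_extension': '.jinja',
        'content': "{% for message in messages %}{{ message['content'] }}{% endfor %}",
    }

    # The branch above then simply writes that text back to disk as UTF-8:
    with open('chat_template.jinja', 'w', encoding='utf-8') as _f:
        _f.write(saved_content['content'])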
@@ -657,6 +660,9 @@ def get_metadata(self):
         model_file=str(tokenizer_file_path),  # pyright: ignore[reportGeneralTypeIssues]
     )
     tokenizer_file_content = s.serialized_model_proto()
+elif tokenizer_file_extension == '.jinja':
+    with open(tokenizer_file_path, encoding='utf-8') as _tokenizer_file:
+        tokenizer_file_content = _tokenizer_file.read()
 else:
     raise ValueError(
         f'Unexpected file ending {tokenizer_file_name} in output of tokenizer.save_pretrained.',
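The save path mirrors the load path: with a chat template set, recent transformers releases have tokenizer.save_pretrained write the template to its own chat_template.jinja file (it previously lived inside tokenizer_config.json), and the new branch captures that file as plain text. A hedged round-trip sketch, using a placeholder model id:

    import os
    import tempfile

    import transformers

    tok = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')  # placeholder id
    tok.chat_template = "{% for m in messages %}{{ m['content'] }}{% endfor %}"

    with tempfile.TemporaryDirectory() as save_dir:
        tok.save_pretrained(save_dir)
        template_path = os.path.join(save_dir, 'chat_template.jinja')
        # Only transformers versions that emit the standalone file hit this branch.
        if os.path.exists(template_path):
            with open(template_path, encoding='utf-8') as f:
                assert tok.chat_template in f.read()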
2 changes: 1 addition & 1 deletion setup.py
@@ -173,7 +173,7 @@ def package_files(prefix: str, directory: str, extension: str):
 ]
 
 extra_deps['nlp'] = [
-    'transformers>=4.11,!=4.34.0,<4.52',
+    'transformers>=4.11,!=4.34.0,<4.53',
     'datasets>=2.4,<4',
     'huggingface-hub>=0.21.2,<0.34',
 ]
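The only change here widens the upper bound so the transformers 4.52 series is admitted, while 4.34.0 stays excluded. A quick sanity check of the new specifier with the packaging library (a sketch, not part of the PR):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    spec = SpecifierSet('>=4.11,!=4.34.0,<4.53')
    assert Version('4.52.0') in spec      # newly admitted by this bump
    assert Version('4.34.0') not in spec  # still explicitly excluded
    assert Version('4.53.0') not in spec  # beyond the new upper bound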
1 change: 0 additions & 1 deletion tests/fixtures/fixtures.py
@@ -252,7 +252,6 @@ def tiny_bert_config_helper():
 
     config_object = {
         'architectures': ['BertForMaskedLM',],
-        'attn_implementation': 'eager',
         'attention_probs_dropout_prob': 0.1,
         'gradient_checkpointing': False,
         'hidden_act': 'gelu',
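Dropping 'attn_implementation' from the raw config dict appears to sidestep newer transformers releases treating it specially rather than as an ordinary BertConfig kwarg. If eager attention is still wanted, one hedged alternative is to request it at load time; from_pretrained has accepted an attn_implementation argument since around v4.36, and the model id below is a placeholder:

    import transformers

    # Sketch: select the attention backend at load time instead of via the
    # config dict; 'bert-base-uncased' is a placeholder model id.
    model = transformers.AutoModelForMaskedLM.from_pretrained(
        'bert-base-uncased',
        attn_implementation='eager',
    )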
46 changes: 45 additions & 1 deletion tests/models/test_hf_model.py
@@ -1091,7 +1091,6 @@ def test_write_hf_from_composer_direct(tiny_bert_tokenizer, tmp_path):
         'num_attention_heads': 2,
         'num_hidden_layers': 2,
         'intermediate_size': 512,
-        'attn_implementation': 'eager',
     }
     tiny_bert_config = transformers.BertConfig(**tiny_overrides)
     tiny_bert_model = transformers.BertForMaskedLM(tiny_bert_config)
@@ -1533,3 +1532,48 @@ def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_con
     state_dict = hf_model.state_dict()
 
     assert len(state_dict.keys()) == 4
+
+
+def test_hf_tokenizer_with_chat_template(tmp_path: Path, tiny_bert_model, tiny_bert_tokenizer):
+    """Test that tokenizers with chat templates (which create .jinja files) can be checkpointed."""
+    pytest.importorskip('transformers')
+
+    # Use the existing tiny_bert_tokenizer and add a chat template to it
+    tokenizer = tiny_bert_tokenizer
+
+    # Add a chat template to force creation of chat_template.jinja file
+    chat_template = "{% for message in messages %}{{ message['role'] + ': ' + message['content'] + '\n' }}{% endfor %}"
+    tokenizer.chat_template = chat_template  # type: ignore
+
+    # This should not fail even with the chat template
+    trainer = get_lm_trainer(tiny_bert_model, tokenizer, str(tmp_path))
+
+    # The checkpoint save should work without raising "Unexpected file ending" error
+    trainer.save_checkpoint(str(tmp_path / 'chat-template-checkpoint.pt'))
+
+    # Verify the checkpoint was created successfully
+    assert (tmp_path / 'chat-template-checkpoint.pt').exists()
+
+    # Verify that the checkpoint actually contains a .jinja file
+    loaded_checkpoint = torch.load(tmp_path / 'chat-template-checkpoint.pt', weights_only=False)
+    hf_state = loaded_checkpoint['state']['integrations']['huggingface']
+    hf_tokenizer_state = hf_state['tokenizer']
+
+    # Check that chat_template.jinja file is present in the checkpoint
+    jinja_files = [filename for filename in hf_tokenizer_state.keys() if filename.endswith('.jinja')]
+    assert len(jinja_files) > 0, 'No .jinja files found in checkpoint'
+    assert 'chat_template.jinja' in hf_tokenizer_state, 'chat_template.jinja not found in checkpoint'
+
+    # Verify the .jinja file has the correct extension and content type
+    jinja_entry = hf_tokenizer_state['chat_template.jinja']
+    assert jinja_entry['file_extension'] == '.jinja', f"Expected .jinja extension, got {jinja_entry['file_extension']}"
+    assert isinstance(jinja_entry['content'], str), f"Expected string content, got {type(jinja_entry['content'])}"
+    assert chat_template in jinja_entry['content'], 'Chat template content not found in .jinja file'
+
+    # Test loading the checkpoint back
+    _, hf_loaded_tokenizer = HuggingFaceModel.hf_from_composer_checkpoint(
+        checkpoint_path=str(tmp_path / 'chat-template-checkpoint.pt'),
+    )
+
+    # Verify the chat template was preserved
+    assert hf_loaded_tokenizer.chat_template == chat_template  # type: ignore
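As a closing usage note (not part of the diff): once the template survives the checkpoint round trip, it renders conversations as expected. A sketch using apply_chat_template, which transformers tokenizers have provided since v4.34; given the template defined in the test above, the rendered string is fully determined:

    messages = [
        {'role': 'user', 'content': 'hello'},
        {'role': 'assistant', 'content': 'hi there'},
    ]
    rendered = hf_loaded_tokenizer.apply_chat_template(messages, tokenize=False)
    assert rendered == 'user: hello\nassistant: hi there\n'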