
Update transformers requirement from !=4.34.0,<4.52,>=4.11 to >=4.11,!=4.34.0,<4.53 #3862

Merged
6 changes: 6 additions & 0 deletions composer/models/huggingface.py
@@ -287,6 +287,9 @@ def load_huggingface_tokenizer_from_saved_state(
     s.load_from_serialized_proto(saved_content['content'])  # pyright: ignore[reportGeneralTypeIssues]
     with open(tokenizer_file_path, 'wb') as _f:
         _f.write(s.serialized_model_proto())
+elif saved_content['file_extension'] == '.jinja':
+    with open(tokenizer_file_path, 'w', encoding='utf-8') as _f:
+        _f.write(saved_content['content'])
 
 hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
     tokenizer_save_dir,
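For reference, the entry that the new '.jinja' branch consumes looks like the following. A minimal sketch: the key names ('file_extension', 'content') match the assertions in the test added at the bottom of this PR, while the template literal itself is only an illustration.

    # Sketch of one entry in the serialized tokenizer state; key names match
    # the new test below, the template string is illustrative.
    saved_content = {
        'file_extension': '.jinja',
        'content': "{% for message in messages %}{{ message['content'] }}{% endfor %}",
    }

    # The branch above then simply writes that text back to disk as UTF-8:
    with open('chat_template.jinja', 'w', encoding='utf-8') as _f:
        _f.write(saved_content['content'])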
@@ -657,6 +660,9 @@ def get_metadata(self):
         model_file=str(tokenizer_file_path),  # pyright: ignore[reportGeneralTypeIssues]
     )
     tokenizer_file_content = s.serialized_model_proto()
+elif tokenizer_file_extension == '.jinja':
+    with open(tokenizer_file_path, encoding='utf-8') as _tokenizer_file:
+        tokenizer_file_content = _tokenizer_file.read()
 else:
     raise ValueError(
         f'Unexpected file ending {tokenizer_file_name} in output of tokenizer.save_pretrained.',
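The save path mirrors the load path: with a chat template set, recent transformers releases have tokenizer.save_pretrained write the template to its own chat_template.jinja file (it previously lived inside tokenizer_config.json), and the new branch captures that file as plain text. A hedged round-trip sketch, using a placeholder model id:

    import os
    import tempfile

    import transformers

    tok = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')  # placeholder id
    tok.chat_template = "{% for m in messages %}{{ m['content'] }}{% endfor %}"

    with tempfile.TemporaryDirectory() as save_dir:
        tok.save_pretrained(save_dir)
        template_path = os.path.join(save_dir, 'chat_template.jinja')
        # Only transformers versions that emit the standalone file hit this branch.
        if os.path.exists(template_path):
            with open(template_path, encoding='utf-8') as f:
                assert tok.chat_template in f.read()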
2 changes: 1 addition & 1 deletion setup.py
@@ -173,7 +173,7 @@ def package_files(prefix: str, directory: str, extension: str):
 ]
 
 extra_deps['nlp'] = [
-    'transformers>=4.11,!=4.34.0,<4.52',
+    'transformers>=4.11,!=4.34.0,<4.53',
     'datasets>=2.4,<4',
     'huggingface-hub>=0.21.2,<0.34',
 ]
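The only change here widens the upper bound so the transformers 4.52 series is admitted, while 4.34.0 stays excluded. A quick sanity check of the new specifier with the packaging library (a sketch, not part of the PR):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    spec = SpecifierSet('>=4.11,!=4.34.0,<4.53')
    assert Version('4.52.0') in spec      # newly admitted by this bump
    assert Version('4.34.0') not in spec  # still explicitly excluded
    assert Version('4.53.0') not in spec  # beyond the new upper bound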
1 change: 0 additions & 1 deletion tests/fixtures/fixtures.py
@@ -252,7 +252,6 @@ def tiny_bert_config_helper():
 
     config_object = {
         'architectures': ['BertForMaskedLM',],
-        'attn_implementation': 'eager',
         'attention_probs_dropout_prob': 0.1,
         'gradient_checkpointing': False,
         'hidden_act': 'gelu',
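Dropping 'attn_implementation' from the raw config dict appears to sidestep newer transformers releases treating it specially rather than as an ordinary BertConfig kwarg. If eager attention is still wanted, one hedged alternative is to request it at load time; from_pretrained has accepted an attn_implementation argument since around v4.36, and the model id below is a placeholder:

    import transformers

    # Sketch: select the attention backend at load time instead of via the
    # config dict; 'bert-base-uncased' is a placeholder model id.
    model = transformers.AutoModelForMaskedLM.from_pretrained(
        'bert-base-uncased',
        attn_implementation='eager',
    )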
46 changes: 45 additions & 1 deletion tests/models/test_hf_model.py
@@ -1091,7 +1091,6 @@ def test_write_hf_from_composer_direct(tiny_bert_tokenizer, tmp_path):
         'num_attention_heads': 2,
         'num_hidden_layers': 2,
         'intermediate_size': 512,
-        'attn_implementation': 'eager',
     }
     tiny_bert_config = transformers.BertConfig(**tiny_overrides)
     tiny_bert_model = transformers.BertForMaskedLM(tiny_bert_config)
@@ -1533,3 +1532,48 @@ def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_con
     state_dict = hf_model.state_dict()
 
     assert len(state_dict.keys()) == 4
+
+
+def test_hf_tokenizer_with_chat_template(tmp_path: Path, tiny_bert_model, tiny_bert_tokenizer):
+    """Test that tokenizers with chat templates (which create .jinja files) can be checkpointed."""
+    pytest.importorskip('transformers')
+
+    # Use the existing tiny_bert_tokenizer and add a chat template to it
+    tokenizer = tiny_bert_tokenizer
+
+    # Add a chat template to force creation of chat_template.jinja file
+    chat_template = "{% for message in messages %}{{ message['role'] + ': ' + message['content'] + '\n' }}{% endfor %}"
+    tokenizer.chat_template = chat_template  # type: ignore
+
+    # This should not fail even with the chat template
+    trainer = get_lm_trainer(tiny_bert_model, tokenizer, str(tmp_path))
+
+    # The checkpoint save should work without raising "Unexpected file ending" error
+    trainer.save_checkpoint(str(tmp_path / 'chat-template-checkpoint.pt'))
+
+    # Verify the checkpoint was created successfully
+    assert (tmp_path / 'chat-template-checkpoint.pt').exists()
+
+    # Verify that the checkpoint actually contains a .jinja file
+    loaded_checkpoint = torch.load(tmp_path / 'chat-template-checkpoint.pt', weights_only=False)
+    hf_state = loaded_checkpoint['state']['integrations']['huggingface']
+    hf_tokenizer_state = hf_state['tokenizer']
+
+    # Check that chat_template.jinja file is present in the checkpoint
+    jinja_files = [filename for filename in hf_tokenizer_state.keys() if filename.endswith('.jinja')]
+    assert len(jinja_files) > 0, 'No .jinja files found in checkpoint'
+    assert 'chat_template.jinja' in hf_tokenizer_state, 'chat_template.jinja not found in checkpoint'
+
+    # Verify the .jinja file has the correct extension and content type
+    jinja_entry = hf_tokenizer_state['chat_template.jinja']
+    assert jinja_entry['file_extension'] == '.jinja', f"Expected .jinja extension, got {jinja_entry['file_extension']}"
+    assert isinstance(jinja_entry['content'], str), f"Expected string content, got {type(jinja_entry['content'])}"
+    assert chat_template in jinja_entry['content'], 'Chat template content not found in .jinja file'
+
+    # Test loading the checkpoint back
+    _, hf_loaded_tokenizer = HuggingFaceModel.hf_from_composer_checkpoint(
+        checkpoint_path=str(tmp_path / 'chat-template-checkpoint.pt'),
+    )
+
+    # Verify the chat template was preserved
+    assert hf_loaded_tokenizer.chat_template == chat_template  # type: ignore
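As a closing usage note (not part of the diff): once the template survives the checkpoint round trip, it renders conversations as expected. A sketch using apply_chat_template, which transformers tokenizers have provided since v4.34; given the template defined in the test above, the rendered string is fully determined:

    messages = [
        {'role': 'user', 'content': 'hello'},
        {'role': 'assistant', 'content': 'hi there'},
    ]
    rendered = hf_loaded_tokenizer.apply_chat_template(messages, tokenize=False)
    assert rendered == 'user: hello\nassistant: hi there\n'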