5 changes: 4 additions & 1 deletion src/transformers/feature_extraction_utils.py
@@ -512,7 +512,10 @@ def get_feature_extractor_dict(
with open(resolved_feature_extractor_file, encoding="utf-8") as reader:
text = reader.read()
feature_extractor_dict = json.loads(text)
feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)
if "audio_processor" in feature_extractor_dict:
feature_extractor_dict = feature_extractor_dict["audio_processor"]
else:
feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)

except json.JSONDecodeError:
raise OSError(
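For orientation, a minimal sketch of the new lookup order in `get_feature_extractor_dict`, assuming a nested config file of the kind this PR starts writing (the file contents below are illustrative, not taken from the PR):

```python
import json

# Hypothetical nested processor config: a nested "audio_processor" section now wins,
# otherwise a nested "feature_extractor" section, otherwise the flat top-level dict.
text = json.dumps(
    {
        "processor_class": "MyProcessor",
        "audio_processor": {"feature_size": 80, "sampling_rate": 16000},
    }
)

feature_extractor_dict = json.loads(text)
if "audio_processor" in feature_extractor_dict:
    feature_extractor_dict = feature_extractor_dict["audio_processor"]
else:
    feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)

print(feature_extractor_dict)  # {'feature_size': 80, 'sampling_rate': 16000}
```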
2 changes: 1 addition & 1 deletion src/transformers/models/auto/processing_auto.py
@@ -300,7 +300,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
processor_config_file = cached_file(pretrained_model_name_or_path, PROCESSOR_NAME, **cached_file_kwargs)
if processor_config_file is not None:
config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
processor_class = config_dict.get("processor_class", None)
processor_class = config_dict.get("processor_class")
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]

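The two spellings are equivalent; a tiny standalone check (the dict is illustrative):

```python
# dict.get already defaults to None when the key is missing,
# so the explicit second argument was redundant.
config_dict = {"auto_map": {"AutoProcessor": "custom_processing.MyProcessor"}}

assert config_dict.get("processor_class") is None
assert config_dict.get("processor_class", None) is None
```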
142 changes: 43 additions & 99 deletions src/transformers/processing_utils.py
@@ -524,7 +524,6 @@ class ProcessorMixin(PushToHubMixin):
"""

attributes = ["feature_extractor", "tokenizer"]
optional_attributes = ["chat_template", "audio_tokenizer"]
optional_call_args: list[str] = []
# Names need to be attr_class for attr in attributes
feature_extractor_class = None
@@ -534,21 +533,18 @@ class ProcessorMixin(PushToHubMixin):

# args have to match the attributes class attribute
def __init__(self, *args, **kwargs):
# First, extract optional attributes from kwargs if present
# Optional attributes can never be positional arguments
for optional_attribute in self.optional_attributes:
optional_attribute_value = kwargs.pop(optional_attribute, None)
setattr(self, optional_attribute, optional_attribute_value)
# First, extract chat template from kwargs. It can never be a positional arg
setattr(self, "chat_template", kwargs.pop("chat_template", None))

# Check audio tokenizer for its class but do not treat it as attr to avoid saving weights
if optional_attribute == "audio_tokenizer" and optional_attribute_value is not None:
proper_class = self.check_argument_for_proper_class(optional_attribute, optional_attribute_value)

if not (is_torch_available() and isinstance(optional_attribute_value, PreTrainedAudioTokenizerBase)):
raise ValueError(
f"Tried to use `{proper_class}` for audio tokenization. However, this class is not"
" registered for audio tokenization."
)
# Check audio tokenizer for its class but do not treat it as attr to avoid saving weights
if (audio_tokenizer := kwargs.pop("audio_tokenizer", None)) is not None:
proper_class = self.check_argument_for_proper_class("audio_tokenizer", audio_tokenizer)
if not (is_torch_available() and isinstance(audio_tokenizer, PreTrainedAudioTokenizerBase)):
raise ValueError(
f"Tried to use `{proper_class}` for audio tokenization. However, this class is not"
" registered for audio tokenization."
)
setattr(self, "audio_tokenizer", audio_tokenizer)

# Sanitize args and kwargs
for key in kwargs:
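A standalone sketch of the kwarg-extraction pattern used in the reworked `__init__` (the function and argument names here are illustrative, not from the PR): optional arguments are popped from kwargs so they can never arrive positionally, and the audio tokenizer is only validated when it was actually provided.

```python
def init_like(*args, **kwargs):
    # chat_template is always pulled from kwargs, never positional
    chat_template = kwargs.pop("chat_template", None)
    if (audio_tokenizer := kwargs.pop("audio_tokenizer", None)) is not None:
        # the real code additionally requires a PreTrainedAudioTokenizerBase
        # instance and raises a ValueError otherwise; only the control flow is shown
        print(f"validating {type(audio_tokenizer).__name__}")
    return chat_template, audio_tokenizer, args, kwargs

print(init_like("feature_extractor", "tokenizer", chat_template="{{ messages }}"))
# ('{{ messages }}', None, ('feature_extractor', 'tokenizer'), {})
```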
@@ -652,7 +648,7 @@ def check_argument_for_proper_class(self, argument_name, argument):

return proper_class

def to_dict(self, legacy_serialization=True) -> dict[str, Any]:
def to_dict(self) -> dict[str, Any]:
"""
Serializes this instance to a Python dictionary.

@@ -664,20 +660,19 @@ def to_dict(self, legacy_serialization=True) -> dict[str, Any]:
# Get the kwargs in `__init__`.
sig = inspect.signature(self.__init__)
# Only save the attributes that are presented in the kwargs of `__init__`.
attrs_to_save = list(sig.parameters)
# or in the attributes
attrs_to_save = list(sig.parameters) + self.__class__.attributes
# extra attributes to be kept
attrs_to_save += ["auto_map"]

if legacy_serialization:
# Don't save attributes like `tokenizer`, `image processor` etc. in processor config if `legacy=True`
attrs_to_save = [x for x in attrs_to_save if x not in self.__class__.attributes]

if "tokenizer" in output:
del output["tokenizer"]
if "qformer_tokenizer" in output:
del output["qformer_tokenizer"]
if "protein_tokenizer" in output:
del output["protein_tokenizer"]
if "char_tokenizer" in output:
del output["char_tokenizer"]
if "chat_template" in output:
del output["chat_template"]

@@ -693,45 +688,40 @@ def cast_array_to_list(dictionary):
dictionary[key] = cast_array_to_list(value)
return dictionary

# Special case, add `audio_tokenizer` dict which points to model weights and path
if "audio_tokenizer" in output:
audio_tokenizer_dict = {
"audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
"audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
}
output["audio_tokenizer"] = audio_tokenizer_dict

# Serialize attributes as a dict
output = {
k: v.to_dict() if isinstance(v, PushToHubMixin) else v
for k, v in output.items()
if (
k in attrs_to_save # keep all attributes that have to be serialized
and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes that are objects
and (
(legacy_serialization and not isinstance(v, PushToHubMixin)) or not legacy_serialization
) # remove `PushToHubMixin` objects
)
}
output = cast_array_to_list(output)

# Special case, add `audio_tokenizer` dict which points to model weights and path
if not legacy_serialization and "audio_tokenizer" in output:
audio_tokenizer_dict = {
"audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
"audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
}
# Update or overwrite, what do audio tokenizers expect when loading?
output["audio_tokenizer"] = audio_tokenizer_dict

output["processor_class"] = self.__class__.__name__

return output
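For orientation, a hedged sketch of the dictionary shape the unified `to_dict` now produces for a hypothetical audio-capable processor (class names, paths, and values below are illustrative, not taken from the PR):

```python
# Illustrative only: attributes listed in `cls.attributes` are serialized inline via
# their own to_dict(), tokenizer-like attributes and the chat template are popped,
# and the audio tokenizer is stored as a pointer (class + name_or_path), not weights.
expected_shape = {
    "processor_class": "MyAudioProcessor",
    "feature_extractor": {
        "feature_size": 80,
        "sampling_rate": 16000,
    },
    "audio_tokenizer": {
        "audio_tokenizer_class": "MyAudioTokenizer",
        "audio_tokenizer_name_or_path": "org/my-audio-tokenizer",
    },
}
```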

def to_json_string(self, legacy_serialization=True) -> str:
def to_json_string(self) -> str:
"""
Serializes this instance to a JSON string.

Returns:
`str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
"""
dictionary = self.to_dict(legacy_serialization=legacy_serialization)
dictionary = self.to_dict()

return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"

def to_json_file(self, json_file_path: Union[str, os.PathLike], legacy_serialization=True):
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
"""
Save this instance to a JSON file.

@@ -740,14 +730,14 @@ def to_json_file(self, json_file_path: Union[str, os.PathLike], legacy_serializa
Path to the JSON file in which this processor instance's parameters will be saved.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string(legacy_serialization=legacy_serialization))
writer.write(self.to_json_string())

def __repr__(self):
attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
attributes_repr = "\n".join(attributes_repr)
return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"

def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_serialization: bool = True, **kwargs):
def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
"""
Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
@@ -768,10 +758,6 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
legacy_serialization (`bool`, *optional*, defaults to `True`):
Whether or not to save processor attributes in separate config files (legacy) or in processor's config
file as a nested dict. Saving all attributes in a single dict will become the default in future versions.
Set to `legacy_serialization=True` until then.
kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
"""
@@ -806,20 +792,16 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
save_jinja_files = kwargs.get("save_jinja_files", True)

for attribute_name in self.attributes:
attribute = getattr(self, attribute_name)
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if attribute_name == "tokenizer":
attribute = getattr(self, attribute_name)
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Propagate save_jinja_files to tokenizer to ensure we don't get conflicts
attribute.save_pretrained(save_directory, save_jinja_files=save_jinja_files)
elif legacy_serialization:
attribute = getattr(self, attribute_name)
# Include the processor class in attribute config so this processor can then be reloaded with `AutoProcessor` API.
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)
attribute.save_pretrained(save_directory)
elif attribute._auto_class is not None:
custom_object_save(attribute, save_directory, config=attribute)

if self._auto_class is not None:
# We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
@@ -832,9 +814,7 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
# plus we save chat_template in its own file
output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
output_chat_template_file_jinja = os.path.join(save_directory, CHAT_TEMPLATE_FILE)
output_chat_template_file_legacy = os.path.join(
save_directory, LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE
) # Legacy filename
output_chat_template_file_legacy = os.path.join(save_directory, LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE)
chat_template_dir = os.path.join(save_directory, CHAT_TEMPLATE_DIR)

# Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
@@ -877,39 +857,10 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
"separate files using the `save_jinja_files` argument."
)

if legacy_serialization:
output_audio_tokenizer_file = os.path.join(save_directory, AUDIO_TOKENIZER_NAME)
processor_dict = self.to_dict()

# For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and
# `auto_map` is not specified.
if set(processor_dict.keys()) != {"processor_class"}:
self.to_json_file(output_processor_file)
logger.info(f"processor saved in {output_processor_file}")

if set(processor_dict.keys()) == {"processor_class"}:
return_files = []
else:
return_files = [output_processor_file]

if self.audio_tokenizer is not None:
audio_tokenizer_class = self.audio_tokenizer.__class__.__name__
audio_tokenizer_name_or_path = self.audio_tokenizer.name_or_path
audio_tokenizer_dict = {
"audio_tokenizer_class": audio_tokenizer_class,
"audio_tokenizer_name_or_path": audio_tokenizer_name_or_path,
}
audio_tokenizer_json = json.dumps(audio_tokenizer_dict, indent=2, sort_keys=True) + "\n"
with open(output_audio_tokenizer_file, "w", encoding="utf-8") as writer:
writer.write(audio_tokenizer_json)

# Create a unified `preprocessor_config.json` and save all attributes as a composite config, except for tokenizers
# NOTE: this will become the default way to save all processor attributes in future versions. Toggled off for now to give
# us time for smoother transition
else:
self.to_json_file(output_processor_file, legacy_serialization=False)
logger.info(f"processor saved in {output_processor_file}")
return_files = [output_processor_file]
self.to_json_file(output_processor_file)
logger.info(f"processor saved in {output_processor_file}")
return_files = [output_processor_file]

if push_to_hub:
self._upload_modified_files(
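A hedged usage sketch of the now-unified saving path (the repo id below is a placeholder, and the exact file list depends on the processor):

```python
import os

from transformers import AutoProcessor

# "org/some-model" is a placeholder repo id, not a checkpoint from this PR.
processor = AutoProcessor.from_pretrained("org/some-model")
processor.save_pretrained("./saved_processor")

# With the legacy branch removed, the composite processor config file is always written,
# alongside the tokenizer's own files and any chat template file.
print(sorted(os.listdir("./saved_processor")))
```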
@@ -1169,10 +1120,6 @@ def get_processor_dict(
audio_tokenizer_path, **audio_tokenizer_kwargs
)

# Pop attributes if saved in a single processor dict, they are loaded in `_get_arguments_from_pretrained`
for attribute in cls.attributes:
processor_dict.pop(attribute, None)

return processor_dict, kwargs

@classmethod
@@ -1196,12 +1143,9 @@ def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs):
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

# We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
# If we don't pop, some specific kwargs will raise a warning
if "processor_class" in processor_dict:
del processor_dict["processor_class"]

if "auto_map" in processor_dict:
del processor_dict["auto_map"]
# If we don't pop, some specific kwargs will raise a warning or error
for unused_kwarg in cls.attributes + ["auto_map", "processor_class"]:
processor_dict.pop(unused_kwarg, None)

# override processor_dict with given kwargs
processor_dict.update(kwargs)
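A minimal sketch of the consolidated clean-up, assuming a processor whose `attributes` are `["feature_extractor", "tokenizer"]` (the dict below is illustrative):

```python
processor_dict = {
    "processor_class": "MyProcessor",
    "auto_map": {"AutoProcessor": "custom_processing.MyProcessor"},
    "feature_extractor": {"feature_size": 80},  # loaded separately in _get_arguments_from_pretrained
    "num_query_tokens": 32,                     # a genuine __init__ kwarg survives
}

attributes = ["feature_extractor", "tokenizer"]
for unused_kwarg in attributes + ["auto_map", "processor_class"]:
    processor_dict.pop(unused_kwarg, None)

print(processor_dict)  # {'num_query_tokens': 32}
```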
@@ -1461,8 +1405,8 @@ def from_pretrained(
if token is not None:
kwargs["token"] = token

args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **kwargs)

@classmethod
9 changes: 4 additions & 5 deletions tests/models/align/test_processing_align.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil
import tempfile
@@ -23,7 +22,7 @@
from transformers import BertTokenizer, BertTokenizerFast
from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin

@@ -67,9 +66,9 @@ def setUp(self):
"image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711],
}
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(image_processor_map, fp)
image_processor = EfficientNetImageProcessor(**image_processor_map)
processor = AlignProcessor(tokenizer=self.get_tokenizer(), image_processor=image_processor)
processor.save_pretrained(self.tmpdirname)

def get_tokenizer(self, **kwargs):
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)