5 changes: 4 additions & 1 deletion src/transformers/feature_extraction_utils.py
@@ -512,7 +512,10 @@ def get_feature_extractor_dict(
with open(resolved_feature_extractor_file, encoding="utf-8") as reader:
text = reader.read()
feature_extractor_dict = json.loads(text)
feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)
if "audio_processor" in feature_extractor_dict:
feature_extractor_dict = feature_extractor_dict["audio_processor"]
else:
feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)

except json.JSONDecodeError:
raise OSError(
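For orientation, a minimal sketch of the new lookup order in `get_feature_extractor_dict`, assuming a nested config file of the kind this PR starts writing (the file contents below are illustrative, not taken from the PR):

```python
import json

# Hypothetical nested processor config: a nested "audio_processor" section now wins,
# otherwise a nested "feature_extractor" section, otherwise the flat top-level dict.
text = json.dumps(
    {
        "processor_class": "MyProcessor",
        "audio_processor": {"feature_size": 80, "sampling_rate": 16000},
    }
)

feature_extractor_dict = json.loads(text)
if "audio_processor" in feature_extractor_dict:
    feature_extractor_dict = feature_extractor_dict["audio_processor"]
else:
    feature_extractor_dict = feature_extractor_dict.get("feature_extractor", feature_extractor_dict)

print(feature_extractor_dict)  # {'feature_size': 80, 'sampling_rate': 16000}
```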
2 changes: 1 addition & 1 deletion src/transformers/models/auto/processing_auto.py
@@ -300,7 +300,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
processor_config_file = cached_file(pretrained_model_name_or_path, PROCESSOR_NAME, **cached_file_kwargs)
if processor_config_file is not None:
config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs)
processor_class = config_dict.get("processor_class", None)
processor_class = config_dict.get("processor_class")
if "AutoProcessor" in config_dict.get("auto_map", {}):
processor_auto_map = config_dict["auto_map"]["AutoProcessor"]

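The two spellings are equivalent; a tiny standalone check (the dict is illustrative):

```python
# dict.get already defaults to None when the key is missing,
# so the explicit second argument was redundant.
config_dict = {"auto_map": {"AutoProcessor": "custom_processing.MyProcessor"}}

assert config_dict.get("processor_class") is None
assert config_dict.get("processor_class", None) is None
```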
142 changes: 43 additions & 99 deletions src/transformers/processing_utils.py
@@ -524,7 +524,6 @@ class ProcessorMixin(PushToHubMixin):
"""

attributes = ["feature_extractor", "tokenizer"]
optional_attributes = ["chat_template", "audio_tokenizer"]
optional_call_args: list[str] = []
# Names need to be attr_class for attr in attributes
feature_extractor_class = None
@@ -534,21 +533,18 @@ class ProcessorMixin(PushToHubMixin):

# args have to match the attributes class attribute
def __init__(self, *args, **kwargs):
# First, extract optional attributes from kwargs if present
# Optional attributes can never be positional arguments
for optional_attribute in self.optional_attributes:
optional_attribute_value = kwargs.pop(optional_attribute, None)
setattr(self, optional_attribute, optional_attribute_value)
# First, extract chat template from kwargs. It can never be a positional arg
setattr(self, "chat_template", kwargs.pop("chat_template", None))

# Check audio tokenizer for its class but do not treat it as attr to avoid saving weights
if optional_attribute == "audio_tokenizer" and optional_attribute_value is not None:
proper_class = self.check_argument_for_proper_class(optional_attribute, optional_attribute_value)

if not (is_torch_available() and isinstance(optional_attribute_value, PreTrainedAudioTokenizerBase)):
raise ValueError(
f"Tried to use `{proper_class}` for audio tokenization. However, this class is not"
" registered for audio tokenization."
)
# Check audio tokenizer for its class but do not treat it as attr to avoid saving weights
if (audio_tokenizer := kwargs.pop("audio_tokenizer", None)) is not None:
proper_class = self.check_argument_for_proper_class("audio_tokenizer", audio_tokenizer)
if not (is_torch_available() and isinstance(audio_tokenizer, PreTrainedAudioTokenizerBase)):
raise ValueError(
f"Tried to use `{proper_class}` for audio tokenization. However, this class is not"
" registered for audio tokenization."
)
setattr(self, "audio_tokenizer", audio_tokenizer)

# Sanitize args and kwargs
for key in kwargs:
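A standalone sketch of the kwarg-extraction pattern used in the reworked `__init__` (the function and argument names here are illustrative, not from the PR): optional arguments are popped from kwargs so they can never arrive positionally, and the audio tokenizer is only validated when it was actually provided.

```python
def init_like(*args, **kwargs):
    # chat_template is always pulled from kwargs, never positional
    chat_template = kwargs.pop("chat_template", None)
    if (audio_tokenizer := kwargs.pop("audio_tokenizer", None)) is not None:
        # the real code additionally requires a PreTrainedAudioTokenizerBase
        # instance and raises a ValueError otherwise; only the control flow is shown
        print(f"validating {type(audio_tokenizer).__name__}")
    return chat_template, audio_tokenizer, args, kwargs

print(init_like("feature_extractor", "tokenizer", chat_template="{{ messages }}"))
# ('{{ messages }}', None, ('feature_extractor', 'tokenizer'), {})
```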
@@ -652,7 +648,7 @@ def check_argument_for_proper_class(self, argument_name, argument):

return proper_class

def to_dict(self, legacy_serialization=True) -> dict[str, Any]:
def to_dict(self) -> dict[str, Any]:
"""
Serializes this instance to a Python dictionary.

@@ -664,20 +660,19 @@ def to_dict(self, legacy_serialization=True) -> dict[str, Any]:
# Get the kwargs in `__init__`.
sig = inspect.signature(self.__init__)
# Only save the attributes that are presented in the kwargs of `__init__`.
attrs_to_save = list(sig.parameters)
# or in the attributes
attrs_to_save = list(sig.parameters) + self.__class__.attributes
# extra attributes to be kept
attrs_to_save += ["auto_map"]

if legacy_serialization:
# Don't save attributes like `tokenizer`, `image processor` etc. in processor config if `legacy=True`
attrs_to_save = [x for x in attrs_to_save if x not in self.__class__.attributes]

if "tokenizer" in output:
del output["tokenizer"]
if "qformer_tokenizer" in output:
del output["qformer_tokenizer"]
if "protein_tokenizer" in output:
del output["protein_tokenizer"]
if "char_tokenizer" in output:
del output["char_tokenizer"]
if "chat_template" in output:
del output["chat_template"]

@@ -693,45 +688,40 @@ def cast_array_to_list(dictionary):
dictionary[key] = cast_array_to_list(value)
return dictionary

# Special case, add `audio_tokenizer` dict which points to model weights and path
if "audio_tokenizer" in output:
audio_tokenizer_dict = {
"audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
"audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
}
output["audio_tokenizer"] = audio_tokenizer_dict

# Serialize attributes as a dict
output = {
k: v.to_dict() if isinstance(v, PushToHubMixin) else v
for k, v in output.items()
if (
k in attrs_to_save # keep all attributes that have to be serialized
and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes that are objects
and (
(legacy_serialization and not isinstance(v, PushToHubMixin)) or not legacy_serialization
) # remove `PushToHubMixin` objects
)
}
output = cast_array_to_list(output)

# Special case, add `audio_tokenizer` dict which points to model weights and path
if not legacy_serialization and "audio_tokenizer" in output:
audio_tokenizer_dict = {
"audio_tokenizer_class": self.audio_tokenizer.__class__.__name__,
"audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path,
}
# Update or overwrite, what do audio tokenizers expect when loading?
output["audio_tokenizer"] = audio_tokenizer_dict

output["processor_class"] = self.__class__.__name__

return output
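For orientation, a hedged sketch of the dictionary shape the unified `to_dict` now produces for a hypothetical audio-capable processor (class names, paths, and values below are illustrative, not taken from the PR):

```python
# Illustrative only: attributes listed in `cls.attributes` are serialized inline via
# their own to_dict(), tokenizer-like attributes and the chat template are popped,
# and the audio tokenizer is stored as a pointer (class + name_or_path), not weights.
expected_shape = {
    "processor_class": "MyAudioProcessor",
    "feature_extractor": {
        "feature_size": 80,
        "sampling_rate": 16000,
    },
    "audio_tokenizer": {
        "audio_tokenizer_class": "MyAudioTokenizer",
        "audio_tokenizer_name_or_path": "org/my-audio-tokenizer",
    },
}
```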

def to_json_string(self, legacy_serialization=True) -> str:
def to_json_string(self) -> str:
"""
Serializes this instance to a JSON string.

Returns:
`str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
"""
dictionary = self.to_dict(legacy_serialization=legacy_serialization)
dictionary = self.to_dict()

return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"

def to_json_file(self, json_file_path: Union[str, os.PathLike], legacy_serialization=True):
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
"""
Save this instance to a JSON file.

@@ -740,14 +730,14 @@ def to_json_file(self, json_file_path: Union[str, os.PathLike], legacy_serializa
Path to the JSON file in which this processor instance's parameters will be saved.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string(legacy_serialization=legacy_serialization))
writer.write(self.to_json_string())

def __repr__(self):
attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
attributes_repr = "\n".join(attributes_repr)
return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"

def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_serialization: bool = True, **kwargs):
def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
"""
Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
@@ -768,10 +758,6 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
legacy_serialization (`bool`, *optional*, defaults to `True`):
Whether or not to save processor attributes in separate config files (legacy) or in processor's config
file as a nested dict. Saving all attributes in a single dict will become the default in future versions.
Set to `legacy_serialization=True` until then.
kwargs (`dict[str, Any]`, *optional*):
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
"""
@@ -806,20 +792,16 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
save_jinja_files = kwargs.get("save_jinja_files", True)

for attribute_name in self.attributes:
attribute = getattr(self, attribute_name)
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if attribute_name == "tokenizer":
attribute = getattr(self, attribute_name)
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Propagate save_jinja_files to tokenizer to ensure we don't get conflicts
attribute.save_pretrained(save_directory, save_jinja_files=save_jinja_files)
elif legacy_serialization:
attribute = getattr(self, attribute_name)
# Include the processor class in attribute config so this processor can then be reloaded with `AutoProcessor` API.
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)
attribute.save_pretrained(save_directory)
elif attribute._auto_class is not None:
custom_object_save(attribute, save_directory, config=attribute)

if self._auto_class is not None:
# We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up.
@@ -832,9 +814,7 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
# plus we save chat_template in its own file
output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
output_chat_template_file_jinja = os.path.join(save_directory, CHAT_TEMPLATE_FILE)
output_chat_template_file_legacy = os.path.join(
save_directory, LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE
) # Legacy filename
output_chat_template_file_legacy = os.path.join(save_directory, LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE)
chat_template_dir = os.path.join(save_directory, CHAT_TEMPLATE_DIR)

# Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
@@ -877,39 +857,10 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, legacy_seri
"separate files using the `save_jinja_files` argument."
)

if legacy_serialization:
output_audio_tokenizer_file = os.path.join(save_directory, AUDIO_TOKENIZER_NAME)
processor_dict = self.to_dict()

# For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and
# `auto_map` is not specified.
if set(processor_dict.keys()) != {"processor_class"}:
self.to_json_file(output_processor_file)
logger.info(f"processor saved in {output_processor_file}")

if set(processor_dict.keys()) == {"processor_class"}:
return_files = []
else:
return_files = [output_processor_file]

if self.audio_tokenizer is not None:
audio_tokenizer_class = self.audio_tokenizer.__class__.__name__
audio_tokenizer_name_or_path = self.audio_tokenizer.name_or_path
audio_tokenizer_dict = {
"audio_tokenizer_class": audio_tokenizer_class,
"audio_tokenizer_name_or_path": audio_tokenizer_name_or_path,
}
audio_tokenizer_json = json.dumps(audio_tokenizer_dict, indent=2, sort_keys=True) + "\n"
with open(output_audio_tokenizer_file, "w", encoding="utf-8") as writer:
writer.write(audio_tokenizer_json)

# Create a unified `preprocessor_config.json` and save all attributes as a composite config, except for tokenizers
# NOTE: this will become the default way to save all processor attributes in future versions. Toggled off for now to give
# us time for smoother transition
else:
self.to_json_file(output_processor_file, legacy_serialization=False)
logger.info(f"processor saved in {output_processor_file}")
return_files = [output_processor_file]
self.to_json_file(output_processor_file)
logger.info(f"processor saved in {output_processor_file}")
return_files = [output_processor_file]

if push_to_hub:
self._upload_modified_files(
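A hedged usage sketch of the now-unified saving path (the repo id below is a placeholder, and the exact file list depends on the processor):

```python
import os

from transformers import AutoProcessor

# "org/some-model" is a placeholder repo id, not a checkpoint from this PR.
processor = AutoProcessor.from_pretrained("org/some-model")
processor.save_pretrained("./saved_processor")

# With the legacy branch removed, the composite processor config file is always written,
# alongside the tokenizer's own files and any chat template file.
print(sorted(os.listdir("./saved_processor")))
```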
@@ -1169,10 +1120,6 @@ def get_processor_dict(
audio_tokenizer_path, **audio_tokenizer_kwargs
)

# Pop attributes if saved in a single processor dict, they are loaded in `_get_arguments_from_pretrained`
for attribute in cls.attributes:
processor_dict.pop(attribute, None)

return processor_dict, kwargs

@classmethod
@@ -1196,12 +1143,9 @@ def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs):
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

# We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
# If we don't pop, some specific kwargs will raise a warning
if "processor_class" in processor_dict:
del processor_dict["processor_class"]

if "auto_map" in processor_dict:
del processor_dict["auto_map"]
# If we don't pop, some specific kwargs will raise a warning or error
for unused_kwarg in cls.attributes + ["auto_map", "processor_class"]:
processor_dict.pop(unused_kwarg, None)

# override processor_dict with given kwargs
processor_dict.update(kwargs)
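A minimal sketch of the consolidated clean-up, assuming a processor whose `attributes` are `["feature_extractor", "tokenizer"]` (the dict below is illustrative):

```python
processor_dict = {
    "processor_class": "MyProcessor",
    "auto_map": {"AutoProcessor": "custom_processing.MyProcessor"},
    "feature_extractor": {"feature_size": 80},  # loaded separately in _get_arguments_from_pretrained
    "num_query_tokens": 32,                     # a genuine __init__ kwarg survives
}

attributes = ["feature_extractor", "tokenizer"]
for unused_kwarg in attributes + ["auto_map", "processor_class"]:
    processor_dict.pop(unused_kwarg, None)

print(processor_dict)  # {'num_query_tokens': 32}
```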
@@ -1461,8 +1405,8 @@ def from_pretrained(
if token is not None:
kwargs["token"] = token

args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **kwargs)

@classmethod
9 changes: 4 additions & 5 deletions tests/models/align/test_processing_align.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil
import tempfile
@@ -23,7 +22,7 @@
from transformers import BertTokenizer, BertTokenizerFast
from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
from transformers.testing_utils import require_vision
from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin

@@ -67,9 +66,9 @@ def setUp(self):
"image_mean": [0.48145466, 0.4578275, 0.40821073],
"image_std": [0.26862954, 0.26130258, 0.27577711],
}
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(image_processor_map, fp)
image_processor = EfficientNetImageProcessor(**image_processor_map)
processor = AlignProcessor(tokenizer=self.get_tokenizer(), image_processor=image_processor)
processor.save_pretrained(self.tmpdirname)

def get_tokenizer(self, **kwargs):
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)