diff --git a/setup.py b/setup.py
index cbfcfd43428524..922258d65efab7 100644
--- a/setup.py
+++ b/setup.py
@@ -117,7 +117,7 @@
     "fugashi>=1.0",
     "GitPython<3.1.19",
     "hf-doc-builder>=0.3.0",
-    "huggingface-hub>=0.23.2,<1.0",
+    "huggingface-hub>=0.24.0,<1.0",
     "importlib_metadata",
     "ipadic>=1.0.0,<2.0",
     "isort>=5.5.4",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index a633f54a4af1a8..9543b58ad40d91 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -24,7 +24,7 @@
     "fugashi": "fugashi>=1.0",
     "GitPython": "GitPython<3.1.19",
     "hf-doc-builder": "hf-doc-builder>=0.3.0",
-    "huggingface-hub": "huggingface-hub>=0.23.2,<1.0",
+    "huggingface-hub": "huggingface-hub>=0.24.0,<1.0",
     "importlib_metadata": "importlib_metadata",
     "ipadic": "ipadic>=1.0.0,<2.0",
     "isort": "isort>=5.5.4",
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 0df59d1db8e05b..71555e2f7c3162 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -94,7 +94,7 @@
     replace_return_docstrings,
     strtobool,
 )
-from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files
+from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files
 from .utils.import_utils import (
     ENV_VARS_TRUE_VALUES,
     is_sagemaker_mp_enabled,
@@ -381,92 +381,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
     return False
 
 
-def shard_checkpoint(
-    state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
-):
-    """
-    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
-    given size.
-
-    The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
-    optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
-    limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
-    [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].
-
-    <Tip warning={true}>
-
-    If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
-    have a size greater than `max_shard_size`.
-
-    </Tip>
-
-    Args:
-        state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
-        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
-            The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
-            (like `"5MB"`).
-        weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
-            The name of the model save file.
-    """
-    logger.warning(
-        "Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using "
-        "split_torch_state_dict_into_shards from huggingface_hub library"
-    )
-    max_shard_size = convert_file_size_to_int(max_shard_size)
-
-    sharded_state_dicts = [{}]
-    last_block_size = 0
-    total_size = 0
-    storage_id_to_block = {}
-
-    for key, weight in state_dict.items():
-        # when bnb serialization is used the weights in the state dict can be strings
-        # check: https://github.com/huggingface/transformers/pull/24416 for more details
-        if isinstance(weight, str):
-            continue
-        else:
-            storage_id = id_tensor_storage(weight)
-
-        # If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block`
-        if storage_id in storage_id_to_block and weight.device != torch.device("meta"):
-            block_id = storage_id_to_block[storage_id]
-            sharded_state_dicts[block_id][key] = weight
-            continue
-
-        weight_size = weight.numel() * dtype_byte_size(weight.dtype)
-        # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one
-        # weight in the current shard.
-        if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
-            sharded_state_dicts.append({})
-            last_block_size = 0
-
-        sharded_state_dicts[-1][key] = weight
-        last_block_size += weight_size
-        total_size += weight_size
-        storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1
-
-    # If we only have one shard, we return it
-    if len(sharded_state_dicts) == 1:
-        return {weights_name: sharded_state_dicts[0]}, None
-
-    # Otherwise, let's build the index
-    weight_map = {}
-    shards = {}
-    for idx, shard in enumerate(sharded_state_dicts):
-        shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin")
-        shard_file = shard_file.replace(
-            ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
-        )
-        shards[shard_file] = shard
-        for key in shard.keys():
-            weight_map[key] = shard_file
-
-    # Add the metadata
-    metadata = {"total_size": total_size}
-    index = {"metadata": metadata, "weight_map": weight_map}
-    return shards, index
-
-
 def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
     """
     This is the same as
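For reference (not part of the patch): a minimal sketch of the replacement API that the deprecation message points to. `split_torch_state_dict_into_shards` only plans the layout, mapping shard filenames to tensor names; the tensors themselves are still looked up in the caller's state dict. The tensor shapes below are illustrative.

    import torch
    from huggingface_hub import split_torch_state_dict_into_shards

    state_dict = {"linear.weight": torch.zeros((1000, 1000)), "linear.bias": torch.zeros(1000)}

    # Plan the split: filename -> [tensor names], plus index metadata.
    split = split_torch_state_dict_into_shards(state_dict, max_shard_size="5GB")

    # Rebuild the old `shard_checkpoint`-style output: filename -> {tensor name: tensor}, plus the index.
    shards = {
        filename: {name: state_dict[name] for name in names}
        for filename, names in split.filename_to_tensors.items()
    }
    index = (
        {"metadata": split.metadata, "weight_map": split.tensor_to_filename} if split.is_sharded else None
    )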
diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py
index 08e42d1c8f70cb..d34528b7431453 100644
--- a/src/transformers/models/blip_2/modeling_blip_2.py
+++ b/src/transformers/models/blip_2/modeling_blip_2.py
@@ -2203,7 +2203,7 @@ def forward(
             logger.warning_once(
                 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(
@@ -2326,7 +2326,7 @@ def generate(
             logger.warning_once(
                 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(
diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py
index c6852378412895..4129920f9b3663 100644
--- a/src/transformers/models/blip_2/processing_blip_2.py
+++ b/src/transformers/models/blip_2/processing_blip_2.py
@@ -153,7 +153,7 @@ def __call__(
             logger.warning_once(
                 "Expanding inputs for image tokens in BLIP-2 should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
 
         # cast to desired return tensors type
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index a78a3b66877429..e5622185bc39a8 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -1471,7 +1471,7 @@ def forward(
             logger.warning_once(
                 "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(
@@ -1610,7 +1610,7 @@ def generate(
             logger.warning_once(
                 "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
             attention_mask = torch.cat(
diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py
index 3d48839d376c5c..a96d97fb07e1d9 100644
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -148,7 +148,7 @@ def __call__(
             logger.warning_once(
                 "Expanding inputs for image tokens in InstructBLIP should be done in processing. "
                 "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
 
         # cast to desired return tensors type after concatenating
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 6d6bf4a6f38e3f..e8536ee50f94bb 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -485,7 +485,7 @@ def forward(
                 "Expanding inputs for image tokens in LLaVa should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             # prefill stage vs decoding stage (legacy behavior copied)
             if input_ids.shape[1] != 1:
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 0ff40acc405224..820fa581711a63 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -160,7 +160,7 @@ def __call__(
                 "Expanding inputs for image tokens in LLaVa should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
 
         text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index 2d23c48225cd00..269663c7d6141a 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -868,7 +868,7 @@ def forward(
                 "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
             if input_ids.shape[1] != 1:
                 inputs_embeds = inputs_embeds.to(image_features.dtype)
diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
index 310083c1ce53ac..89b885f0f1abb2 100644
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -143,7 +143,7 @@ def __call__(
                 "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
                 "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                 "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
             )
         else:
             image_sizes = iter(image_inputs["image_sizes"])
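The warnings above name the two processor attributes to set. A minimal sketch of that update, for reference only; the checkpoint name and the concrete values are illustrative and must match the model's vision config:

    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")  # example checkpoint
    # With both attributes set, the processor expands image tokens itself instead of
    # leaving the legacy expansion to the model's forward pass.
    processor.patch_size = 14  # patch size of the vision tower (CLIP ViT-L/14 here)
    processor.vision_feature_select_strategy = "default"  # value taken from the model config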
" - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) else: image_sizes = iter(image_inputs["image_sizes"]) diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py index 502aa78263649a..284d8a3d454848 100644 --- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py @@ -21,10 +21,11 @@ import types import torch +from huggingface_hub import split_torch_state_dict_into_shards from packaging import version from transformers import AutoTokenizer, GPT2Config -from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint +from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME def add_checkpointing_args(parser): @@ -571,7 +572,15 @@ def convert_checkpoint_from_megatron_to_transformers(args): # Store the state_dict to file. max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size - shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size) + state_dict_split = split_torch_state_dict_into_shards(output_state_dict, max_shard_size=max_shard_size) + shards = index = None + for tensors in state_dict_split.filename_to_tensors.values(): + shards = {tensor: state_dict[tensor] for tensor in tensors} + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } # Save the model for shard_file, shard in shards.items(): diff --git a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py index 44cf17b1cf1899..a0c97fc4e234ab 100644 --- a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py +++ b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py @@ -21,10 +21,10 @@ import re import torch -from huggingface_hub import hf_hub_download +from huggingface_hub import hf_hub_download, split_torch_state_dict_into_shards from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig -from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint +from transformers.modeling_utils import WEIGHTS_INDEX_NAME NUM_HIDDEN_LAYERS_MAPPING = { @@ -116,7 +116,16 @@ def convert_rmkv_checkpoint_to_hf_format( state_dict = convert_state_dict(state_dict) # 4. 
Split in shards and save - shards, index = shard_checkpoint(state_dict) + state_dict_split = split_torch_state_dict_into_shards(state_dict) + shards = index = None + for tensors in state_dict_split.filename_to_tensors.values(): + shards = {tensor: state_dict[tensor] for tensor in tensors} + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + for shard_file, shard in shards.items(): torch.save(shard, os.path.join(output_dir, shard_file)) diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index a3b3de33fa66ee..30adcb6ab5c089 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -578,7 +578,7 @@ def forward( "Expanding inputs for image tokens in Video-LLaVa should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) if input_ids.shape[1] != 1: for features, frames in ((image_features, 1), (video_features, num_frames)): diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index d3c27ef56ca0ce..597d94cc2f0031 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -149,9 +149,10 @@ def __call__( if encoded_images is not None and (self.patch_size is None or self.vision_feature_select_strategy is None): logger.warning_once( "Expanding inputs for image tokens in Video-LLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.44." + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set " + "directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = " + "{{vision_feature_select_strategy}}`. Using processors without these attributes in the config is " + "deprecated and will throw an error in v4.50." ) # Replace the image/video tokens with the expanded token sequence elif encoded_images is not None: diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 4060f8c8ecd1bf..b45325d2194e24 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -476,7 +476,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in VipLLaVa should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
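In both conversion scripts the hub helper only plans the shard layout; the shard files still have to be filled from the script's own weights dict (`output_state_dict` in the Megatron script, `state_dict` in the RWKV one). Since the later `for shard_file, shard in shards.items(): torch.save(...)` loop expects a mapping from filename to tensors, here is a sketch (illustrative only, not part of the patch) of building `shards` keyed by shard filename:

    state_dict_split = split_torch_state_dict_into_shards(output_state_dict, max_shard_size=max_shard_size)
    # One entry per shard file: {shard filename: {tensor name: tensor}}.
    shards = {
        shard_file: {name: output_state_dict[name] for name in tensor_names}
        for shard_file, tensor_names in state_dict_split.filename_to_tensors.items()
    }
    index = None
    if state_dict_split.is_sharded:
        index = {"metadata": state_dict_split.metadata, "weight_map": state_dict_split.tensor_to_filename}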
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) # prefill stage vs decoding stage (legacy behavior copied) if input_ids.shape[1] != 1: diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 5fd6251224c3ed..96a30df7e5587f 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -105,7 +105,6 @@ _find_disjoint, _find_identical, dtype_byte_size, - shard_checkpoint, ) from transformers.pytorch_utils import isin_mps_friendly @@ -668,71 +667,6 @@ def test_no_super_init_config_and_model(self): for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(torch.equal(p1, p2)) - def test_shard_checkpoint(self): - # This is the model we will use, total size 340,000 bytes. - model = torch.nn.Sequential( - torch.nn.Linear(100, 200, bias=False), # size 80,000 - torch.nn.Linear(200, 200, bias=False), # size 160,000 - torch.nn.Linear(200, 100, bias=False), # size 80,000 - torch.nn.Linear(100, 50, bias=False), # size 20,000 - ) - state_dict = model.state_dict() - - with self.subTest("No shard when max size is bigger than model size"): - shards, index = shard_checkpoint(state_dict) - self.assertIsNone(index) - self.assertDictEqual(shards, {WEIGHTS_NAME: state_dict}) - - with self.subTest("Test sharding, no weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="300kB") - # Split is first two layers then last two. - self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "pytorch_model-00001-of-00002.bin", - "1.weight": "pytorch_model-00001-of-00002.bin", - "2.weight": "pytorch_model-00002-of-00002.bin", - "3.weight": "pytorch_model-00002-of-00002.bin", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]} - shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, {"pytorch_model-00001-of-00002.bin": shard1, "pytorch_model-00002-of-00002.bin": shard2} - ) - - with self.subTest("Test sharding with weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="100kB") - # Split is first layer, second layer then last 2. - self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "pytorch_model-00001-of-00003.bin", - "1.weight": "pytorch_model-00002-of-00003.bin", - "2.weight": "pytorch_model-00003-of-00003.bin", - "3.weight": "pytorch_model-00003-of-00003.bin", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"]} - shard2 = {"1.weight": state_dict["1.weight"]} - shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, - { - "pytorch_model-00001-of-00003.bin": shard1, - "pytorch_model-00002-of-00003.bin": shard2, - "pytorch_model-00003-of-00003.bin": shard3, - }, - ) - def test_checkpoint_sharding_local_bin(self): model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")