From 251a2409c694c29ee28e66c954670c483cf54961 Mon Sep 17 00:00:00 2001 From: James Thewlis Date: Tue, 23 Jul 2024 01:12:16 -0400 Subject: [PATCH 001/200] Add llama3-llava-next-8b to llava_next conversion script (#31395) * Add llama3-llava-next-8b to llava_next conversion script Adds support for the lmms-lab/llama3-llava-next-8b model to the convert_llava_next_weights_to_hf.py script, along with an example prompt generated from the llava_llama_3 conv_template in the LLaVA-NeXT repo. * Exclude <|begin_of_text|> from prompt example This token gets added automatically, so it should not be included in the prompt example. * Add llava-next-72b and llava-next-110b Adds the Qwen-based LLaVA-Next models to the conversion script, along with changes to load the models on multiple GPUs for inference. * Add llama3 and qwen prompt formats to docs * Chat prompt and padding side left for llama3 batched * update * Update src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * remove code * better naming --------- Co-authored-by: raushan Co-authored-by: Raushan Turganbay Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/llava_next.md | 11 ++ .../convert_llava_next_weights_to_hf.py | 125 +++++++++++++----- 2 files changed, 101 insertions(+), 35 deletions(-) diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b9d06ff97ffa..9e7caa37d7b9 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -100,6 +100,17 @@ print(text_prompt) "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" ``` +[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format: + +```bash +"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format: + +```bash +"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" +``` ## Usage example diff --git a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py index 2c8aefe39dc2..06edc5c9b1ad 100644 --- a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py +++ b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py @@ -24,6 +24,7 @@ """ import argparse +import gc import glob import json from pathlib import Path @@ -111,6 +112,16 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): elif model_id == "liuhaotian/llava-v1.6-34b": text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" image_token_index = 64000 + elif model_id == "lmms-lab/llama3-llava-next-8b": + text_model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + image_token_index = 128256 + elif model_id == "lmms-lab/llava-next-72b": + text_model_id = "Qwen/Qwen1.5-72B-Chat" + image_token_index = 151646 + elif model_id == "lmms-lab/llava-next-110b": + text_model_id = "Qwen/Qwen1.5-110B-Chat" + image_token_index = 151646 + vision_model_id = data["mm_vision_tower"] torch.set_default_dtype(torch.float16) @@ -120,7 +131,7 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast) tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - if model_id == "liuhaotian/llava-v1.6-mistral-7b": + if model_id in ("liuhaotian/llava-v1.6-mistral-7b", "lmms-lab/llama3-llava-next-8b"): # Mistral-7B doesn't have a padding token set yet tokenizer.add_special_tokens({"pad_token": ""}) @@ -151,28 +162,45 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): # We add an image token so we resize the model # Pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - if model_id == "liuhaotian/llava-v1.6-34b": - # this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and - num_tokens = vocab_size + 3 - else: - # this one has 2 additional tokens, namely and - num_tokens = vocab_size + 2 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - (dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) + # Qwen-based models have extra unused space in the vocab size already, so no need to resize + if model_id not in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: + pad_shape = 64 + vocab_size = config.text_config.vocab_size + if model_id == "liuhaotian/llava-v1.6-34b": + 
# this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and + num_tokens = vocab_size + 3 + else: + # this one has 2 additional tokens, namely and + num_tokens = vocab_size + 2 + model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) + model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( + tuple( + ( + dist.sample() + for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]) + ) + ), + dim=0, + ) + model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( + tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), + dim=0, + ) + + print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + # Make space so we can load the model properly now. + del state_dict + gc.collect() - device = "cuda:2" - model.to(device) + # Load everything back for inference tests in float32 because prev script was written as that + # Though it's mostly loaded in fp16 as original weights are in fp16 + model = LlavaNextForConditionalGeneration.from_pretrained(pytorch_dump_folder_path, device_map="auto") + processor = LlavaNextProcessor.from_pretrained(pytorch_dump_folder_path) + device = model.device # prepare inputs image = load_image() @@ -182,6 +210,11 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT:" elif model_id == "liuhaotian/llava-v1.6-34b": prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + elif model_id == "lmms-lab/llama3-llava-next-8b": + prompt = "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + elif model_id in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: + prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" + inputs = processor(images=image, text=prompt, return_tensors="pt") # verify inputs @@ -194,8 +227,6 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): original_input_ids = torch.load(filepath, map_location="cpu") # replace -200 by image_token_index (since we use token ID = 32000 for the image token) original_input_ids[original_input_ids == -200] = image_token_index - print(tokenizer.decode([id for id in original_input_ids.tolist()[0] if id != -200])) - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() elif model_id == "liuhaotian/llava-v1.6-34b": @@ -243,6 +274,26 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): dtype=torch.float32, device=device, ) + elif model_id == "lmms-lab/llama3-llava-next-8b": + expected_slice = torch.tensor( + [[-3.9648, 1.1396, 3.3145], [-5.3594, -1.5654, -1.9619], [-12.3750, -10.6797, -9.3125]], + dtype=torch.float32, + device=device, + ) + elif model_id == "lmms-lab/llava-next-72b": + # Not yet checked against reference + expected_slice = torch.tensor( + [[3.7148, 3.9277, 3.4395], [-0.4341, 1.1387, 6.5117], [3.2324, 3.4688, 4.1133]], + dtype=torch.float32, + device=device, + ) + elif model_id == "lmms-lab/llava-next-110b": + # Not yet checked against reference + expected_slice = torch.tensor( + [[-2.5449, -1.6738, -2.0371], [1.0811, 3.4961, 5.0312], [1.7803, 2.5137, 2.4277]], + dtype=torch.float32, + device=device, + ) else: raise ValueError(f"Model {model_id} not supported") @@ -268,6 +319,12 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM" elif model_id == "liuhaotian/llava-v1.6-34b": expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? <|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-" + elif model_id == "lmms-lab/llama3-llava-next-8b": + expected_text = 'system\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.user\n\n\nWhat is shown in this image?assistant\n\n\nThe image shows a radar chart, also known as a spider chart or a web chart, which is a type of graph used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along each axis and connected to form a polygon.\n\nIn this particular radar chart, there are several axes labeled with different variables, such as "MM-Vet," "LL' + elif model_id == "lmms-lab/llava-next-72b": + expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image displays a radar chart, also known as a spider chart or a star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the value of each variable is represented by the distance from the center of the chart to the point where the axis intersects with the line representing that variable's value.\n\nIn this particular chart, there are several axes" + elif model_id == "lmms-lab/llava-next-110b": + expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart comparing the performance of different models on various visual question answering (VQA) benchmarks. Each colored line represents a different model, and the distance from the center of the chart indicates the score or performance level of the model on a particular benchmark. The benchmarks are labeled around the edges of the chart, and include VQA v2, GQA, VizWiz, TextVQA, MMBench-CN, MME, and others. The chart allows for a" else: raise ValueError(f"Model {model_id} not supported") @@ -281,7 +338,7 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): inputs = processor( images=[image, cats_image], - text=[prompt, "[INST] \nHow many cats are there? 
[/INST]"], + text=[prompt, prompt], padding=True, return_tensors="pt", ).to(device) @@ -305,16 +362,11 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) print(outputs) - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - if push_to_hub: - repo_id = model_id.split("/")[-1] - model.push_to_hub(f"llava-hf/{repo_id}-hf") - processor.push_to_hub(f"llava-hf/{repo_id}-hf") + checkpoint_name = model_id.split("/")[-1] + print(f"Pushing to repo llava-hf/{checkpoint_name}-hf") + model.push_to_hub(f"llava-hf/{checkpoint_name}-hf") + processor.push_to_hub(f"llava-hf/{checkpoint_name}-hf") if __name__ == "__main__": @@ -328,11 +380,14 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): "liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b", "liuhaotian/llava-v1.6-34b", + "lmms-lab/llama3-llava-next-8b", + "lmms-lab/llava-next-72b", + "lmms-lab/llava-next-110b", ], required=False, ) parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory." ) parser.add_argument( "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." From 3aefb4ec7f957f9561a410eabc6f9d57b2f0384f Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 23 Jul 2024 10:23:55 +0500 Subject: [PATCH 002/200] LLaVaNeXT: pad on right if training (#32134) * pad on right if training * docs * add tests --- docs/source/en/model_doc/llava-next-video.md | 7 ++++ docs/source/en/model_doc/llava_next.md | 7 ++++ .../models/llava_next/modeling_llava_next.py | 4 +- .../modeling_llava_next_video.py | 4 +- .../llava_next/test_modeling_llava_next.py | 38 ++++++++++++++++-- .../test_modeling_llava_next_video.py | 40 +++++++++++++++++-- 6 files changed, 90 insertions(+), 10 deletions(-) diff --git a/docs/source/en/model_doc/llava-next-video.md b/docs/source/en/model_doc/llava-next-video.md index 88e41efc29c8..48e50f950621 100644 --- a/docs/source/en/model_doc/llava-next-video.md +++ b/docs/source/en/model_doc/llava-next-video.md @@ -43,6 +43,13 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre - We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + + +- Llava-Next uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding". + + + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that. We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. 
Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index 9e7caa37d7b9..0c25ed32db5a 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -46,6 +46,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + + +- Llava-Next uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding". + + + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows: diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 5b897b817330..ad76561df54f 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -518,8 +518,8 @@ def _merge_input_ids_with_image_features( _left_padding = torch.any(attention_mask[:, 0] == 0) _right_padding = torch.any(attention_mask[:, -1] == 0) - left_padding = True - if batch_size > 1: + left_padding = True if not self.training else False + if batch_size > 1 and not self.training: if _left_padding and not _right_padding: left_padding = True elif not _left_padding and _right_padding: diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index f2ccb99e6187..e3264dfd91e1 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -562,8 +562,8 @@ def _merge_input_ids_with_image_features( _left_padding = torch.any(attention_mask[:, 0] == 0) _right_padding = torch.any(attention_mask[:, -1] == 0) - left_padding = True - if batch_size > 1: + left_padding = True if not self.training else False + if batch_size > 1 and not self.training: if _left_padding and not _right_padding: left_padding = True elif not _left_padding and _right_padding: diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 69794a85d9fe..70d91002a91b 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -123,7 +123,7 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 30 - self.encoder_seq_length = 341 
+ self.encoder_seq_length = 342 self.image_grid_pinpoints = [[32, 32]] def get_config(self): @@ -156,9 +156,7 @@ def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - # make attention mask left-padded to avoid issues with "model has no attribute padding_side" attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 # we are giving 3 images let's make sure we pass in 3 image tokens input_ids[:, 1] = config.image_token_index labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device) @@ -473,3 +471,37 @@ def test_small_model_integration_test_batch_matches_single(self): self.processor.decode(output_batched[0], skip_special_tokens=True), self.processor.decode(output_single[0], skip_special_tokens=True), ) + + @slow + @require_bitsandbytes + def test_padding_side_when_merging_inputs(self): + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", + load_in_4bit=True, + ) + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e" + cats_image = Image.open(requests.get(url, stream=True).raw) + lowres_img = Image.open(requests.get(lowres_url, stream=True).raw) + + inputs_batched = self.processor( + [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True + ).to(torch_device) + + # model is in eval mode by default so we should get pad on the left side + # we can check the first hidden-states (aka inputs embeds) + # the first element was lo-res image and we expect the first 1414 tokens to be all pads + output_eval = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_eval.hidden_states[0][0, :1414, ...] == 0).all().item()) + + # otherwise padding is on the right side, so it's last 1414 tokens + self.processor.padding_side = "right" + inputs_batched = self.processor( + [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True + ).to(torch_device) + + model.train() + with torch.no_grad(): + output_train = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] 
== 0).all().item()) diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index afe3062fb50e..9ba7ef869ddf 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -124,7 +124,7 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 30 - self.encoder_seq_length = 468 + self.encoder_seq_length = 469 self.image_grid_pinpoints = [[32, 32]] def get_config(self): @@ -166,9 +166,7 @@ def prepare_config_and_inputs(self): def prepare_config_and_inputs_for_common(self): config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - # make attention mask left-padded to avoid issues with "model has no attribute padding_side" attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 # we are giving 3 images and videos let's make sure we pass in 3 special tokens input_ids[:, 1] = config.image_token_index input_ids[:, 2] = config.video_token_index @@ -453,3 +451,39 @@ def test_small_model_integration_test_batch_matches_single(self): self.processor.decode(output_batched[0], skip_special_tokens=True), self.processor.decode(output_single[0], skip_special_tokens=True), ) + + @slow + @require_bitsandbytes + def test_padding_side_when_merging_inputs(self): + model = LlavaNextVideoForConditionalGeneration.from_pretrained( + "llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True + ) + + inputs_batched = self.processor( + [self.prompt_video, self.prompt_image], + images=[self.image], + videos=[self.video], + return_tensors="pt", + padding=True, + ).to(torch_device) + + # model is in eval mode by default so we should get pad on the left side + # we can check the first hidden-states (aka inputs embeds) + # the first element was lo-res image and we expect the first 1482 tokens to be all pads + output_eval = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_eval.hidden_states[0][0, :1482, ...] == 0).all().item()) + + # otherwise padding is on the right side, so it's last 1482 tokens + self.processor.padding_side = "right" + inputs_batched = self.processor( + [self.prompt_video, self.prompt_image], + images=[self.image], + videos=[self.video], + return_tensors="pt", + padding=True, + ).to(torch_device) + + model.train() + with torch.no_grad(): + output_train = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] 
== 0).all().item()) From f83c6f1d02fba5e5ced9357b9c9196c76d937af3 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:54:38 +0800 Subject: [PATCH 003/200] Remove `trust_remote_code` when loading Libri Dummy (#31748) * [whisper integration] use parquet dataset for testing * propagate to others * more propagation * last one --- src/transformers/commands/pt_to_tf.py | 4 +- src/transformers/generation/logits_process.py | 6 +- src/transformers/models/clvp/modeling_clvp.py | 4 +- .../modeling_speech_to_text_2.py | 2 +- .../models/hubert/modeling_hubert.py | 2 +- .../models/hubert/modeling_tf_hubert.py | 4 +- .../modeling_speech_encoder_decoder.py | 2 +- .../speech_to_text/modeling_speech_to_text.py | 4 +- .../modeling_tf_speech_to_text.py | 2 +- .../models/univnet/modeling_univnet.py | 2 +- .../models/wav2vec2/modeling_flax_wav2vec2.py | 6 +- .../models/wav2vec2/modeling_tf_wav2vec2.py | 4 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- .../modeling_wav2vec2_conformer.py | 2 +- .../models/whisper/generation_whisper.py | 2 +- .../models/whisper/modeling_flax_whisper.py | 8 +- .../models/whisper/modeling_tf_whisper.py | 6 +- .../models/whisper/modeling_whisper.py | 6 +- ...xtraction_audio_spectrogram_transformer.py | 4 +- .../clap/test_feature_extraction_clap.py | 4 +- tests/models/clap/test_modeling_clap.py | 16 +-- .../clvp/test_feature_extraction_clvp.py | 4 +- tests/models/clvp/test_modeling_clvp.py | 8 +- .../data2vec/test_modeling_data2vec_audio.py | 4 +- .../test_feature_extraction_encodec.py | 4 +- tests/models/encodec/test_modeling_encodec.py | 12 +-- tests/models/hubert/test_modeling_hubert.py | 4 +- .../models/hubert/test_modeling_tf_hubert.py | 4 +- .../test_feature_extraction_pop2piano.py | 4 +- .../pop2piano/test_processor_pop2piano.py | 4 +- .../test_feature_extraction_seamless_m4t.py | 4 +- tests/models/sew/test_modeling_sew.py | 4 +- tests/models/sew_d/test_modeling_sew_d.py | 4 +- .../test_feature_extraction_speech_to_text.py | 4 +- .../test_modeling_speech_to_text.py | 4 +- .../test_modeling_tf_speech_to_text.py | 4 +- .../test_feature_extraction_speecht5.py | 4 +- .../models/speecht5/test_modeling_speecht5.py | 8 +- .../unispeech/test_modeling_unispeech.py | 4 +- .../test_modeling_unispeech_sat.py | 4 +- .../test_feature_extraction_univnet.py | 4 +- tests/models/univnet/test_modeling_univnet.py | 4 +- .../wav2vec2/test_modeling_flax_wav2vec2.py | 4 +- .../wav2vec2/test_modeling_tf_wav2vec2.py | 4 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 4 +- .../test_modeling_wav2vec2_bert.py | 4 +- .../test_modeling_wav2vec2_conformer.py | 4 +- tests/models/wavlm/test_modeling_wavlm.py | 4 +- .../test_feature_extraction_whisper.py | 4 +- .../whisper/test_modeling_flax_whisper.py | 4 +- .../whisper/test_modeling_tf_whisper.py | 2 +- tests/models/whisper/test_modeling_whisper.py | 26 ++--- .../test_pipelines_audio_classification.py | 4 +- ..._pipelines_automatic_speech_recognition.py | 100 +++++------------- tests/pipelines/test_pipelines_common.py | 4 +- tests/utils/test_audio_utils.py | 4 +- 56 files changed, 110 insertions(+), 254 deletions(-) diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py index 4df45f7f086a..4002b5e0eb85 100644 --- a/src/transformers/commands/pt_to_tf.py +++ b/src/transformers/commands/pt_to_tf.py @@ -202,9 +202,7 @@ def get_inputs(self, pt_model, tf_dummy_inputs, config): """ def _get_audio_input(): - ds = load_dataset( - 
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select(range(2))[:2]["audio"] raw_samples = [x["array"] for x in speech_samples] return raw_samples diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index c9a978f5ee89..b226a059d106 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1760,7 +1760,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means @@ -1812,7 +1812,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default. @@ -1901,7 +1901,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index 4124e380a3d7..d53bed2a5d12 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1681,7 +1681,7 @@ def get_speech_features( >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." - >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() @@ -1754,7 +1754,7 @@ def forward( >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." 
- >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() diff --git a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py index 4db60e0faeb4..8f1a8370933c 100755 --- a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py @@ -831,7 +831,7 @@ def forward( >>> model.config.decoder_start_token_id = tokenizer.bos_token_id >>> # pre-process inputs and labels - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" ... ) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index fd0c271b6681..da79c2894877 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1325,7 +1325,7 @@ def forward( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 6c2a341927e2..2adfeea5b8b8 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1471,7 +1471,7 @@ def call( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 @@ -1583,7 +1583,7 @@ def call( ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index a46a1d62af11..c2f5dd025909 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -464,7 +464,7 @@ def forward( >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values >>> # Inference: Translate English speech to German diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 9832987f4e64..8353a172b212 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1129,7 +1129,7 @@ def forward( >>> model = Speech2TextModel.from_pretrained("facebook/s2t-small-librispeech-asr") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-librispeech-asr") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" ... ) @@ -1270,7 +1270,7 @@ def forward( >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 6ad680d4fc07..bac1256ca4b6 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -1483,7 +1483,7 @@ def call( ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> ds.set_format(type="tf") diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py index 887493fdcf55..5b0c659c302a 100644 --- a/src/transformers/models/univnet/modeling_univnet.py +++ b/src/transformers/models/univnet/modeling_univnet.py @@ -525,7 +525,7 @@ def forward( >>> model = UnivNetModel.from_pretrained("dg845/univnet-dev") >>> feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> # Resample the audio to the feature extractor's sampling rate. >>> ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate)) >>> inputs = feature_extractor( diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index 7a629e24572a..9a24b9d39fda 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -1076,7 +1076,7 @@ class FlaxWav2Vec2Model(FlaxWav2Vec2PreTrainedModel): ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor( @@ -1195,7 +1195,7 @@ class FlaxWav2Vec2ForCTC(FlaxWav2Vec2PreTrainedModel): ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor( @@ -1396,7 +1396,7 @@ def __call__( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = feature_extractor(ds["speech"][0], return_tensors="np").input_values # Batch size 1 diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index cc8478d5b3c0..a8338e363d94 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -1542,7 +1542,7 @@ def call( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 @@ -1654,7 +1654,7 @@ def call( ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 16e50cc06c52..f1d021b58ee5 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1938,7 +1938,7 @@ def forward( >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 >>> # compute masked indices diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 6f631e4683ad..c37dd980d4ed 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1453,7 +1453,7 @@ def forward( >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 >>> # compute masked indices diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 0467362ea2c7..4a28eb920385 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -464,7 +464,7 @@ def generate( >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index 9da592c107da..cc4483963c63 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -985,7 +985,7 @@ def encode( >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", 
"clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np") >>> input_features = inputs.input_features >>> encoder_outputs = model.encode(input_features=input_features) @@ -1045,7 +1045,7 @@ def decode( >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_features = processor(ds[0]["audio"]["array"], return_tensors="np").input_features >>> encoder_outputs = model.encode(input_features=input_features) @@ -1297,7 +1297,7 @@ def decode( >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np") >>> input_features = inputs.input_features >>> encoder_outputs = model.encode(input_features=input_features) @@ -1516,7 +1516,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np") >>> input_features = inputs.input_features >>> generated_ids = model.generate(input_ids=input_features) diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 6f50141bff9f..18f55dce8a22 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -1147,7 +1147,7 @@ def call( >>> model = TFWhisperModel.from_pretrained("openai/whisper-base") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="tf") >>> input_features = inputs.input_features >>> decoder_input_ids = tf.convert_to_tensor([[1, 1]]) * model.config.decoder_start_token_id @@ -1283,7 +1283,7 @@ def call( >>> model = TFWhisperModel.from_pretrained("openai/whisper-base") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor(ds[0]["audio"]["array"], 
return_tensors="tf") >>> input_features = inputs.input_features >>> decoder_input_ids = tf.convert_to_tensor([[1, 1]]) * model.config.decoder_start_token_id @@ -1413,7 +1413,7 @@ def call( >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="tf") >>> input_features = inputs.input_features diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 7ba2af00ad81..6db7da4b957c 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1555,7 +1555,7 @@ def forward( >>> model = WhisperModel.from_pretrained("openai/whisper-base") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id @@ -1698,7 +1698,7 @@ def forward( >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features @@ -1959,7 +1959,7 @@ def forward( >>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> sample = ds[0]["audio"] >>> input_features = processor( ... 
sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 967f1936215e..fbe250908633 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -153,9 +153,7 @@ def test_double_precision_pad(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 8f2d6df3cb6c..d0e913df828b 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -164,9 +164,7 @@ def test_double_precision_pad(self): # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 8e3392133f1f..9f8cc62d2e0f 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -665,9 +665,7 @@ def test_integration_unfused(self): "repeat": 0.0023, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] model_id = "laion/clap-htsat-unfused" @@ -694,9 +692,7 @@ def test_integration_fused(self): "pad": -0.000379, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] model_id = "laion/clap-htsat-fused" @@ -723,9 +719,7 @@ def test_batched_fused(self): "pad": 0.0006, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] model_id = "laion/clap-htsat-fused" @@ -752,9 +746,7 @@ def test_batched_unfused(self): "pad": 0.0019, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = 
load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] model_id = "laion/clap-htsat-unfused" diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py index 83be97e86754..db641eaf6145 100644 --- a/tests/models/clvp/test_feature_extraction_clvp.py +++ b/tests/models/clvp/test_feature_extraction_clvp.py @@ -209,9 +209,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=22050)) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 5d17d3fed622..0cf89a745233 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -371,9 +371,7 @@ def get_config(self): def prepare_config_and_inputs(self): _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs() - ds = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() @@ -555,9 +553,7 @@ def test_model_from_pretrained(self): class ClvpIntegrationTest(unittest.TestCase): def setUp(self): self.text = "This is an example text." 
- ds = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index 8bb16760ce61..d43128286853 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -694,9 +694,7 @@ def test_compute_mask_indices_short_audio(self): @slow class Data2VecAudioModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/encodec/test_feature_extraction_encodec.py b/tests/models/encodec/test_feature_extraction_encodec.py index 73c5019b11ed..e56517ac4106 100644 --- a/tests/models/encodec/test_feature_extraction_encodec.py +++ b/tests/models/encodec/test_feature_extraction_encodec.py @@ -138,9 +138,7 @@ def test_double_precision_pad(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 0a023894d8a0..cff297be8e00 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -461,9 +461,7 @@ def test_integration_24kHz(self): "1.5": [371955], "24.0": [6659962], } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") model_id = "facebook/encodec_24khz" model = EncodecModel.from_pretrained(model_id).to(torch_device) @@ -517,9 +515,7 @@ def test_integration_48kHz(self): "3.0": [144259, 146765, 156435, 176871, 161971], "24.0": [1568553, 1294948, 1306190, 1464747, 1663150], } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") model_id = "facebook/encodec_48khz" model = EncodecModel.from_pretrained(model_id).to(torch_device) @@ -581,9 +577,7 @@ def test_batch_48kHz(self): [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241], ], } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") model_id = 
"facebook/encodec_48khz" model = EncodecModel.from_pretrained(model_id).to(torch_device) diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index cd801be41d7b..86f2b4119324 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -753,9 +753,7 @@ class HubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/hubert/test_modeling_tf_hubert.py b/tests/models/hubert/test_modeling_tf_hubert.py index 35a8d98c233f..3685e6598740 100644 --- a/tests/models/hubert/test_modeling_tf_hubert.py +++ b/tests/models/hubert/test_modeling_tf_hubert.py @@ -609,9 +609,7 @@ class TFHubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/pop2piano/test_feature_extraction_pop2piano.py b/tests/models/pop2piano/test_feature_extraction_pop2piano.py index 5a4652ad577c..c67661479759 100644 --- a/tests/models/pop2piano/test_feature_extraction_pop2piano.py +++ b/tests/models/pop2piano/test_feature_extraction_pop2piano.py @@ -136,9 +136,7 @@ def test_call(self): self.assertTrue(input_features.extrapolated_beatstep.ndim == 2) def test_integration(self): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select([0])["audio"] input_speech = [x["array"] for x in speech_samples][0] sampling_rate = [x["sampling_rate"] for x in speech_samples][0] diff --git a/tests/models/pop2piano/test_processor_pop2piano.py b/tests/models/pop2piano/test_processor_pop2piano.py index 634cdd26bd10..06a8bacfd8a4 100644 --- a/tests/models/pop2piano/test_processor_pop2piano.py +++ b/tests/models/pop2piano/test_processor_pop2piano.py @@ -111,9 +111,7 @@ def test_save_load_pretrained_additional_features(self): def get_inputs(self): """get inputs for both feature extractor and tokenizer""" - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select([0])["audio"] input_speech = [x["array"] for x in speech_samples][0] sampling_rate = [x["sampling_rate"] for x in speech_samples][0] diff --git a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py index d9919e0adea6..a8fca4b90ba9 100644 --- a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py +++ 
b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py @@ -258,9 +258,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasample(self, id): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_sample = ds.sort("id")[id]["audio"]["array"] diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index fe10d994450b..6b21c2e9f712 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -494,9 +494,7 @@ class SEWModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 9fd94fbfef26..b2efdccdf07c 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -508,9 +508,7 @@ class SEWDModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 6c8861e3d868..9023e8467f73 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -259,9 +259,7 @@ def test_double_precision_pad(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 44672f1c588f..4f19cc01b33c 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -793,9 +793,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = 
ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py index d12174533395..c2fd215f3885 100644 --- a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py @@ -587,9 +587,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index f8f7f53cac20..5ec632e7e76c 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -380,9 +380,7 @@ def test_attention_mask_with_truncation_target(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index 1d67bb4f8ab5..7a8aab83272b 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -744,9 +744,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -1771,9 +1769,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 1804e2c95ef4..d0a1d352243b 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -549,9 +549,7 @@ def test_model_from_pretrained(self): @slow class UniSpeechModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: 
x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index f3d467f0795d..1aa2da20d5ec 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -806,9 +806,7 @@ def test_model_from_pretrained(self): @slow class UniSpeechSatModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/univnet/test_feature_extraction_univnet.py b/tests/models/univnet/test_feature_extraction_univnet.py index 673faaae9ada..dfa335d15383 100644 --- a/tests/models/univnet/test_feature_extraction_univnet.py +++ b/tests/models/univnet/test_feature_extraction_univnet.py @@ -327,9 +327,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate)) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py index 4dc28b3c168b..e160c799b786 100644 --- a/tests/models/univnet/test_modeling_univnet.py +++ b/tests/models/univnet/test_modeling_univnet.py @@ -216,9 +216,7 @@ def tearDown(self): torch.cuda.empty_cache() def _load_datasamples(self, num_samples, sampling_rate=24000): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate)) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index 18252a175243..b91d66654de6 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -489,9 +489,7 @@ def test_sample_negatives_with_attn_mask(self): @slow class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py index 2f10e3378d73..7ef97290e61c 100644 --- 
a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py @@ -716,9 +716,7 @@ def tearDown(self): gc.collect() def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 51d105a5ee3f..ff7a85218d3a 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -1464,9 +1464,7 @@ def tearDown(self): backend_empty_cache(torch_device) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index 0fbd000edc8c..80237fea9d1e 100644 --- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -855,9 +855,7 @@ def test_sample_negatives_with_mask(self): @slow class Wav2Vec2BertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) speech_samples = speech_samples[:num_samples]["audio"] diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index ae13a8ecba9d..096d1368ed02 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -863,9 +863,7 @@ def test_sample_negatives_with_mask(self): @slow class Wav2Vec2ConformerModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) speech_samples = speech_samples[:num_samples]["audio"] diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index 8f4d1e850e00..b20792d83545 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -491,9 +491,7 @@ def test_model_from_pretrained(self): @slow class WavLMModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = 
load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py index 579c42519ae0..a8295542f4e3 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -215,9 +215,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index d5e18d22c2f3..4b8092e800ad 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -410,9 +410,7 @@ def default_processor(self): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py index 97143cc4df51..b200671e048f 100644 --- a/tests/models/whisper/test_modeling_tf_whisper.py +++ b/tests/models/whisper/test_modeling_tf_whisper.py @@ -704,7 +704,7 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): def _load_datasamples(num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index a11097fe7dc3..5a59f7a72517 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1835,9 +1835,7 @@ def default_processor(self): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -2718,9 +2716,7 @@ def test_speculative_decoding_distil(self): ) assistant_model.to(torch_device) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", 
"clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features @@ -2769,9 +2765,7 @@ def test_speculative_decoding_non_distil(self): ) assistant_model.to(torch_device) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features @@ -2812,7 +2806,7 @@ def test_whisper_longform_single_batch(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -2848,9 +2842,7 @@ def test_whisper_longform_prompt_ids(self): prompt = "Mr. Kilter, Brionno." # let's force Quilter -> Kilter, Brion -> Brionno prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]") one_audio = np.concatenate([x["array"] for x in ds["audio"]], dtype=np.float32) first_text = ds[0]["text"].lower() @@ -2901,7 +2893,7 @@ def test_whisper_longform_single_batch_prev_cond(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -2983,7 +2975,7 @@ def test_whisper_longform_single_batch_beam(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -3025,7 +3017,7 @@ def test_whisper_longform_multi_batch(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) audios = [] audios.append(one_audio[110000:]) @@ -3079,7 +3071,7 @@ def test_whisper_longform_multi_batch_prev_cond(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = 
load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) audios = [] audios.append(one_audio[110000:]) diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index a8c5deb22844..1f403a8be05d 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -71,9 +71,7 @@ def run_torchaudio(self, audio_classifier): import datasets # test with a local file - dataset = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio = dataset[0]["audio"]["array"] output = audio_classifier(audio) self.assertEqual( diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 82c5580f0ea2..d8810f67eec1 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -294,9 +294,7 @@ def test_torch_large(self): output = speech_recognizer(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @@ -313,9 +311,7 @@ def test_torch_large_with_input_features(self): output = speech_recognizer(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) @@ -545,9 +541,7 @@ def test_torch_whisper(self): model="openai/whisper-tiny", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) @@ -722,9 +716,7 @@ def test_find_longest_common_subsequence(self): @slow @require_torch def test_whisper_timestamp_prediction(self): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") array = np.concatenate( [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] ) @@ -822,9 +814,7 @@ def test_whisper_timestamp_prediction(self): @slow @require_torch def test_whisper_large_timestamp_prediction(self): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = 
load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") array = np.concatenate( [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] ) @@ -918,9 +908,7 @@ def test_whisper_word_timestamps_batched(self): chunk_length_s=3, return_timestamps="word", ) - data = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = data[0]["audio"] # not the same output as test_simple_whisper_asr because of chunking @@ -963,9 +951,7 @@ def test_whisper_large_word_timestamps_batched(self): model="openai/whisper-large-v3", return_timestamps="word", ) - data = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = data[0]["audio"] # not the same output as test_simple_whisper_asr because of chunking @@ -1010,9 +996,7 @@ def test_torch_speech_encoder_decoder(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'}) @@ -1030,9 +1014,7 @@ def test_simple_wav2vec2(self): output = asr(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @@ -1058,9 +1040,7 @@ def test_simple_s2t(self): output = asr(waveform) self.assertEqual(output, {"text": "(Applausi)"}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) @@ -1080,9 +1060,7 @@ def test_simple_whisper_asr(self): model="openai/whisper-tiny.en", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") filename = ds[0]["file"] output = speech_recognizer(filename) self.assertEqual( @@ -1151,9 +1129,7 @@ def test_simple_whisper_translation(self): model="openai/whisper-large", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) @@ -1188,9 +1164,7 @@ def test_whisper_language(self): 
model="openai/whisper-tiny.en", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") filename = ds[0]["file"] # 1. English-only model compatible with no language argument @@ -1323,9 +1297,7 @@ def test_xls_r_to_en(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."}) @@ -1341,9 +1313,7 @@ def test_xls_r_from_en(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."}) @@ -1360,9 +1330,7 @@ def test_speech_to_text_leveraged(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) @@ -1379,9 +1347,7 @@ def test_wav2vec2_conformer_float16(self): framework="pt", ) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] output = speech_recognizer(sample) @@ -1398,9 +1364,7 @@ def test_chunking_fast(self): chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1416,9 +1380,7 @@ def test_return_timestamps_ctc_fast(self): model="hf-internal-testing/tiny-random-wav2vec2", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") # Take short audio to keep the test readable audio = ds[40]["audio"]["array"][:800] @@ -1462,9 +1424,7 @@ def test_chunking_fast_with_lm(self): chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1492,9 +1452,7 @@ def test_with_lm_fast(self): ) self.assertEqual(speech_recognizer.type, "ctc_with_lm") - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio 
= ds[40]["audio"]["array"] n_repeats = 2 @@ -1522,9 +1480,7 @@ def test_with_local_lm_fast(self): ) self.assertEqual(speech_recognizer.type, "ctc_with_lm") - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1608,9 +1564,7 @@ def test_seamless_v2(self): device=torch_device, ) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] result = pipe(sample, generate_kwargs={"tgt_lang": "eng"}) @@ -1633,9 +1587,7 @@ def test_chunking_and_timestamps(self): chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 10 @@ -1747,9 +1699,7 @@ def test_chunking_with_lm(self): model="patrickvonplaten/wav2vec2-base-100h-with-lm", chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 10 diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index d4dbff218558..95349a8335de 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -840,9 +840,7 @@ def test_cached_pipeline_has_minimum_calls_to_head(self): def test_chunk_pipeline_batching_single_file(self): # Make sure we have cached the pipeline. 
pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") - ds = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py index 47c384870d4a..3e417bf7e3b4 100644 --- a/tests/utils/test_audio_utils.py +++ b/tests/utils/test_audio_utils.py @@ -262,9 +262,7 @@ def test_window_function(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] return [x["array"] for x in speech_samples] From 2782aadae2b0b0c313eac3ee70f84f0335577635 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:55:16 +0800 Subject: [PATCH 004/200] [modelling] remove un-necessary transpose for fa2 attention (#31749) * [whisper] remove un-necessary transpose for fa2 attention * propagate --- src/transformers/models/idefics2/modeling_idefics2.py | 6 ++---- src/transformers/models/jamba/modeling_jamba.py | 3 +-- src/transformers/models/whisper/modeling_whisper.py | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 4d978c053d3f..f57f1fc3d51a 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -301,7 +301,7 @@ def forward( # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) @@ -311,7 +311,6 @@ def forward( # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. 
- query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) @@ -817,7 +816,7 @@ def forward( key_states = self.k_proj(torch.cat([context, latents], dim=-2)) value_states = self.v_proj(torch.cat([context, latents], dim=-2)) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -882,7 +881,6 @@ def forward( value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 768e8e016075..28d732628a28 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -406,7 +406,7 @@ def forward( # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -469,7 +469,6 @@ def forward( value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 6db7da4b957c..c4d59a360464 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -387,7 +387,7 @@ def forward( bsz, tgt_len, _ = hidden_states.size() # get query proj - query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz) + query_states = torch.reshape(self.q_proj(hidden_states), (bsz, tgt_len, self.num_heads, self.head_dim)) if past_key_value is not None: is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -416,7 +416,6 @@ def forward( # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim] # We would need to refactor the KV cache to be able to avoid many of these transpose/reshape/view. 
- query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) From 605f3245dcca34381c35520c35ba0b701ed80d58 Mon Sep 17 00:00:00 2001 From: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:11:12 +0200 Subject: [PATCH 005/200] Fix mask creations of `GPTNeoX` and `GPT2` (#31944) * fix mask creation of gpt2 and gpt_neox caused by me * forgot the reshape of masks when shape > 2 * add tests for gpt neox and gpt2 * nit on a comment --- src/transformers/models/gpt2/modeling_gpt2.py | 24 ++++++------- .../models/gpt_neox/modeling_gpt_neox.py | 36 +++++++++---------- tests/models/gpt2/test_modeling_gpt2.py | 34 ++++++++++++++++++ .../models/gpt_neox/test_modeling_gpt_neox.py | 34 ++++++++++++++++++ 4 files changed, 97 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 2b9d300aa9e1..7a51cb3eb2cd 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1030,18 +1030,18 @@ def forward( # Attention mask. _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None - if attention_mask is not None: - attention_mask = attention_mask.view(batch_size, -1) - if self._attn_implementation == "flash_attention_2": - attention_mask = attention_mask if 0 in attention_mask else None - elif _use_sdpa: - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask=attention_mask, - input_shape=(batch_size, input_shape[-1]), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) - else: + attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None + if self._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif _use_sdpa: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask=attention_mask, + input_shape=(batch_size, input_shape[-1]), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) + else: + if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index fd5e0b4fe62e..32988e88df34 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -824,25 +824,23 @@ def forward( inputs_embeds = self.embed_in(input_ids) # Attention mask. 
- if attention_mask is not None: - assert batch_size > 0, "batch_size has to be defined and > 0" - attention_mask = attention_mask.view(batch_size, -1) - if self._attn_implementation == "flash_attention_2": - attention_mask = attention_mask if 0 in attention_mask else None - elif self._attn_implementation == "sdpa" and not output_attentions and head_mask is None: - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask=attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) - else: - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask=attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) + attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None + if self._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions and head_mask is None: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) + else: + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 5755658288f5..3f96c20ab2db 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -426,6 +426,36 @@ def create_and_check_gpt2_weight_initialization(self, config, *args): self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001) self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01) + def create_and_check_cached_forward_with_and_without_attention_mask(self, config, input_ids, *args): + # Relevant issue: https://github.com/huggingface/transformers/issues/31943 + model = GPT2Model(config) + model.to(torch_device) + model.eval() + + # We want this for SDPA, eager works with a `None` attention mask + assert ( + model.config._attn_implementation == "sdpa" + ), "This test assumes the model to have the SDPA implementation for its attention calculations." 
+ + # Prepare cache and non_cache input, needs a full attention mask + cached_len = input_ids.shape[-1] // 2 + input_mask = torch.ones(size=input_ids.size()).to(torch_device) + cache_inputs = {"input_ids": input_ids[:, :cached_len], "attention_mask": input_mask[:, :cached_len]} + non_cache_inputs = {"input_ids": input_ids[:, cached_len:], "attention_mask": input_mask} + + # Cached forward once with the attention mask provided and the other time without it (which should assume full attention) + cache_outputs = model(**cache_inputs) + full_outputs_with_attention_mask = model( + **non_cache_inputs, past_key_values=cache_outputs.past_key_values + ).last_hidden_state + full_outputs_without_attention_mask = model( + non_cache_inputs["input_ids"], past_key_values=cache_outputs.past_key_values + ).last_hidden_state + + self.parent.assertTrue( + torch.allclose(full_outputs_with_attention_mask, full_outputs_without_attention_mask, atol=1e-5) + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -570,6 +600,10 @@ def test_gpt2_weight_initialization(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_gpt2_weight_initialization(*config_and_inputs) + def test_cached_forward_with_and_without_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cached_forward_with_and_without_attention_mask(*config_and_inputs) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 51a4d235c3bc..af162f50713e 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -219,6 +219,36 @@ def create_and_check_decoder_model_past_large_inputs(self, config, input_ids, in # test that outputs are equal for slice self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + def create_and_check_cached_forward_with_and_without_attention_mask(self, config, input_ids, *args): + # Relevant issue: https://github.com/huggingface/transformers/issues/31943 + model = GPTNeoXModel(config) + model.to(torch_device) + model.eval() + + # We want this for SDPA, eager works with a `None` attention mask + assert ( + model.config._attn_implementation == "sdpa" + ), "This test assumes the model to have the SDPA implementation for its attention calculations." 
+ + # Prepare cache and non_cache input, needs a full attention mask + cached_len = input_ids.shape[-1] // 2 + input_mask = torch.ones(size=input_ids.size()).to(torch_device) + cache_inputs = {"input_ids": input_ids[:, :cached_len], "attention_mask": input_mask[:, :cached_len]} + non_cache_inputs = {"input_ids": input_ids[:, cached_len:], "attention_mask": input_mask} + + # Cached forward once with the attention mask provided and the other time without it (which should assume full attention) + cache_outputs = model(**cache_inputs) + full_outputs_with_attention_mask = model( + **non_cache_inputs, past_key_values=cache_outputs.past_key_values + ).last_hidden_state + full_outputs_without_attention_mask = model( + non_cache_inputs["input_ids"], past_key_values=cache_outputs.past_key_values + ).last_hidden_state + + self.parent.assertTrue( + torch.allclose(full_outputs_with_attention_mask, full_outputs_without_attention_mask, atol=1e-5) + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, input_ids, input_mask, token_labels = config_and_inputs @@ -300,6 +330,10 @@ def test_model_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + def test_cached_forward_with_and_without_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cached_forward_with_and_without_attention_mask(*config_and_inputs) + @unittest.skip(reason="Feed forward chunking is not implemented") def test_feed_forward_chunking(self): pass From 7405c1c77e4637768ea0ad5d27d8a4d8d67bfb19 Mon Sep 17 00:00:00 2001 From: KonradSzafer <61851539+KonradSzafer@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:56:21 +0200 Subject: [PATCH 006/200] Add method to retrieve used chat template (#32032) encapsulate chat template logic --- src/transformers/tokenization_utils_base.py | 120 ++++++++++++-------- 1 file changed, 72 insertions(+), 48 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 58052579f2be..7ffd3bbcaa6b 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1772,54 +1772,7 @@ def apply_chat_template( if tokenizer_kwargs is None: tokenizer_kwargs = {} - using_default_template = False - - # First, handle the cases when the model has a dict of multiple templates - if isinstance(self.chat_template, dict) or ( - self.chat_template is None and isinstance(self.default_chat_template, dict) - ): - if self.chat_template is not None: - template_dict = self.chat_template - using_default_dict = False - else: - template_dict = self.default_chat_template - using_default_dict = True - if chat_template is not None and chat_template in template_dict: - # The user can pass the name of a template to the chat template argument instead of an entire template - chat_template = template_dict[chat_template] - if using_default_dict: - using_default_template = True - elif chat_template is None: - if tools is not None and "tool_use" in template_dict: - chat_template = template_dict["tool_use"] - elif "default" in template_dict: - chat_template = template_dict["default"] - else: - raise ValueError( - "This model has multiple chat templates with no default specified! Please either pass a chat " - "template or the name of the template you wish to use to the `chat_template` argument. 
Available " - f"template names are {sorted(template_dict.keys())}." - ) - if using_default_dict: - using_default_template = True - - elif chat_template is None: - # These are the cases when the model has a single template - # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template - if self.chat_template is not None: - chat_template = self.chat_template - else: - chat_template = self.default_chat_template - using_default_template = True - - if using_default_template: - logger.warning_once( - "No chat template is set for this tokenizer, falling back to a default class-level template. This is " - "very error-prone, because models are often trained with templates different from the class default! " - "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which " - "point any code depending on them will stop working. We recommend setting a valid chat template before " - "then to ensure that this model continues working without issues." - ) + chat_template = self.get_chat_template(chat_template, tools) if return_assistant_tokens_mask and not re.search(r"\{\%-?\s*generation\s*-?\%\}", chat_template): logger.warning_once( @@ -2012,6 +1965,77 @@ def activate_tracker(self, rendered_blocks: list[int], generation_indices: list[ jinja_env.globals["raise_exception"] = raise_exception return jinja_env.from_string(chat_template) + def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional[List[Dict]] = None) -> str: + """ + Retrieve the chat template string used for tokenizing chat messages. This template is used + internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat + template for better generation tracking. + + Args: + chat_template (`str`, *optional*): + A Jinja template or the name of a template to use for this conversion. + It is usually not necessary to pass anything to this argument, + as the model's template will be used by default. + tools (`List[Dict]`, *optional*): + A list of tools (callable functions) that will be accessible to the model. If the template does not + support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, + giving the name, description and argument types for the tool. See our + [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + for more information. + + Returns: + `str`: The chat template string. + """ + using_default_template = False + # First, handle the cases when the model has a dict of multiple templates + if isinstance(self.chat_template, dict) or ( + self.chat_template is None and isinstance(self.default_chat_template, dict) + ): + if self.chat_template is not None: + template_dict = self.chat_template + using_default_dict = False + else: + template_dict = self.default_chat_template + using_default_dict = True + if chat_template is not None and chat_template in template_dict: + # The user can pass the name of a template to the chat template argument instead of an entire template + chat_template = template_dict[chat_template] + if using_default_dict: + using_default_template = True + elif chat_template is None: + if tools is not None and "tool_use" in template_dict: + chat_template = template_dict["tool_use"] + elif "default" in template_dict: + chat_template = template_dict["default"] + else: + raise ValueError( + "This model has multiple chat templates with no default specified! 
Please either pass a chat " + "template or the name of the template you wish to use to the `chat_template` argument. Available " + f"template names are {sorted(template_dict.keys())}." + ) + if using_default_dict: + using_default_template = True + + elif chat_template is None: + # These are the cases when the model has a single template + # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template + if self.chat_template is not None: + chat_template = self.chat_template + else: + chat_template = self.default_chat_template + using_default_template = True + + if using_default_template: + logger.warning_once( + "No chat template is set for this tokenizer, falling back to a default class-level template. This is " + "very error-prone, because models are often trained with templates different from the class default! " + "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which " + "point any code depending on them will stop working. We recommend setting a valid chat template before " + "then to ensure that this model continues working without issues." + ) + + return chat_template + @property def default_chat_template(self): """ From 34b43211d782c00da6fef778dbfaff69bbf3f115 Mon Sep 17 00:00:00 2001 From: mig-mfreitas <132093787+mig-mfreitas@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:07:58 +0100 Subject: [PATCH 007/200] Add YaRN and Dynamic-YaRN RoPE Scaling Methods (#30910) * Add YaRN and Dynamic-YaRN RoPE Scaling Methods YaRN (Yet another RoPE extension method) combines the NTK-By-Parts Interpolation and Attention Scaling methods, improving upon existing RoPE interpolation methods for longer context window sizes. Fine-tuned models maintain their original performance across benchmarks while enabling efficient extrapolation and transfer learning for quicker convergence, especially in compute-limited environments. We implement YaRN and Dynamic-YaRN for the following list of models: - LLaMA - Falcon - GPT-NeoX - Olmo - Persimmon - Phi - StableLM - OpenLLaMA New unit tests are added to assert YaRN's correct behavior on both short and long sequence inputs. For more details, please refer to https://arxiv.org/abs/2309.00071. Co-authored-by: Miguel Almeida * Refactor YaRN implementation for LLaMA Iterate on YaRN implementation for LLaMA and remove diff from remaining models for increased PR modularity. 
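For illustration, a minimal sketch of the `rope_scaling` schema accepted by the validation added in this commit (key names follow `_rope_scaling_validation`; the numeric values are placeholders, not tuned recommendations):

```python
from transformers import LlamaConfig

# Hedged example of this commit's YaRN `rope_scaling` schema; numbers are illustrative only.
config = LlamaConfig(
    rope_scaling={
        "type": "yarn",
        "factor": 2.0,                             # float > 1, context extension ratio
        "original_max_position_embeddings": 4096,  # optional, pretraining context length
        "attention_factor": 1.0,                   # optional, overrides the computed attention scaling
        "beta_fast": 32.0,                         # optional, extrapolation boundary
        "beta_slow": 1.0,                          # optional, interpolation boundary
    }
)
```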
This commit includes the following changes: - Merge 'yarn_rope_scaling' and 'rope_scaling' dictionaries - Remove unnecessary attributes ('extrapolation_factor' and 'finetuned') from YaRN classes - Inherit 'forward' method in YaRN classes from superclass - Rename 'yarn' method to 'compute_yarn_scaling' - Extend YaRN tests with further assertions - Fix style inconsistencies Co-authored-by: Miguel Monte e Freitas * Refactor Tensor Building Logic for YaRN - Comply with the the tensor building logic introduced in #30743 - Add referencing to the optimized Attention Factor equation - Remove Dynamic YaRN for a more agile deployment Co-authored-by: mig-mfreitas * remove unwanted file --------- Co-authored-by: Miguel Almeida Co-authored-by: mig-mfreitas Co-authored-by: Joao Gante --- .../models/falcon/modeling_falcon.py | 1 - .../models/fuyu/configuration_fuyu.py | 1 - .../models/gpt_neox/configuration_gpt_neox.py | 1 - .../models/llama/configuration_llama.py | 58 ++++++++++-- .../models/llama/modeling_llama.py | 88 +++++++++++++++++++ .../models/olmo/configuration_olmo.py | 1 - src/transformers/models/olmo/modeling_olmo.py | 1 - .../persimmon/configuration_persimmon.py | 1 - .../models/phi/configuration_phi.py | 1 - .../models/stablelm/configuration_stablelm.py | 1 - tests/models/llama/test_modeling_llama.py | 23 ++++- 11 files changed, 162 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index d1050d542a2f..663582c8a72a 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -283,7 +283,6 @@ def __init__(self, config: FalconConfig): self.attention_dropout = nn.Dropout(config.attention_dropout) self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Falcon def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = FalconRotaryEmbedding( diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index ffcdd2b61750..03d2aecc02b6 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -188,7 +188,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 8e4c94692e05..944dbb5e02f0 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -154,7 +154,6 @@ def __init__( "The hidden size is not divisble by the number of attention heads! Make sure to update them!" ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. 
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 1a059101e424..843731eeffc8 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -84,13 +84,22 @@ class LlamaConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling + strategies: linear, dynamic and yarn. Their scaling factor must be a float greater than 1. The expected format is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update `max_position_embeddings` to the expected new maximum. See the following thread for more information on how these scaling strategies behave: https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an experimental feature, subject to breaking API changes in future versions. + For the `yarn` strategy, the dictionary may also contain the following fields: + `original_max_position_embeddings` (`int`, *optional*): + The original maximum sequence length. This is used to scale the RoPE embeddings. + `attention_factor` (`float`, *optional*): + The attention scaling factor. If unspecified, it defaults to `0.1 ln(s) + 1`, where `s` is the `original_max_position_embeddings/max_position_embeddings` ratio. + `beta_fast` (`float`, *optional*): + Parameter to set the boundary for extrapolation (only) in the linear ramp function. + `beta_slow` (`float`, *optional*): + Parameter to set the boundary for interpolation (only) in the linear ramp function. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -178,15 +187,52 @@ def _rope_scaling_validation(self): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) < 2: raise ValueError( - "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}" + "`rope_scaling` must be a dictionary with a minimum of two fields, `type` and `factor`, " + f"got {self.rope_scaling}" ) rope_scaling_type = self.rope_scaling.get("type", None) rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "yarn"]: raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'yarn'], got {rope_scaling_type}" ) if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + if rope_scaling_type != "yarn": + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) > 6: + raise ValueError( + "`rope_scaling` with type " + f"{rope_scaling_type}" + " must be a dictionary with a maximum of six fields, `type`, `factor`," + "`original_max_position_embeddings`, `attention_factor`, `beta_fast`, `beta_slow`, " + f"got {self.rope_scaling}" + ) + original_max_position_embeddings = self.rope_scaling.get("original_max_position_embeddings", None) + attention_factor = self.rope_scaling.get("attention_factor", None) + beta_fast = self.rope_scaling.get("beta_fast", None) + beta_slow = self.rope_scaling.get("beta_slow", None) + + if original_max_position_embeddings is not None and not isinstance(original_max_position_embeddings, int): + raise ValueError( + f"`rope_scaling`'s original_max_position_embeddings field must be an int, got {original_max_position_embeddings}" + ) + if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + if beta_fast is not None and not isinstance(beta_fast, float): + raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + if beta_slow is not None and not isinstance(beta_slow, float): + raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + + b_fast = beta_fast if beta_fast is not None else 32 + b_slow = beta_slow if beta_slow is not None else 1 + if b_fast < b_slow: + raise ValueError( + f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={b_fast} and beta_slow={b_slow}" + ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 5c0c57f3effe..b624a2d92d09 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -132,6 +132,77 @@ def forward(self, x, position_ids): return cos, sin +class LlamaYarnScalingRotaryEmbedding(LlamaRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + scaling_factor=1, + original_max_position_embeddings=2048, + attention_factor=None, + 
beta_fast=32, + beta_slow=1, + device=None, + ): + super().__init__(dim, max_position_embeddings, base, device, scaling_factor) + + self.original_max_position_embeddings = original_max_position_embeddings + self.attention_factor = attention_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + + if self.attention_factor is None: + # Recommended attention factor for LLaMA models. + # For more details please refer to https://arxiv.org/pdf/2309.00071, Eq. 22. + self.attention_factor = 0.1 * math.log(scaling_factor) + 1.0 + + self.compute_yarn_scaling(device) + + # Inverse dimension formula to find the dimension based on the number of rotations + def find_correction_dim(self, num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + # Find dimension range bounds based on rotations + def find_correction_range(self, low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): + low = math.floor(self.find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(self.find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_mask(self, min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + def forward(self, x, position_ids=None): + # Difference to the original RoPE: applies a scaling factor computed with + # the YaRN method (NTK-by-Parts + Attn Scaling) + # x: [bs, num_attention_heads, seq_len, head_size] + cos, sin = super().forward(x, position_ids) + cos = cos * self.mscale + sin = sin * self.mscale + return cos, sin + + def compute_yarn_scaling(self, device): + pos_freqs = self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (self.scaling_factor * pos_freqs) + + low, high = self.find_correction_range( + self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings + ) + # Get n-dimensional rotational scaling corrected for extrapolation + inv_freq_mask = 1 - self.linear_ramp_mask(low, high, self.dim // 2).float().to(device) + inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + + self.register_buffer("inv_freq", inv_freq) + # Get n-dimensional magnitude scaling corrected for interpolation + self.mscale = self.attention_factor + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -258,6 +329,15 @@ def _init_rope(self): else: scaling_type = self.config.rope_scaling["type"] scaling_factor = self.config.rope_scaling["factor"] + # Yarn parameters + kwargs = { + "dim": self.config.rope_scaling.get("original_max_position_embeddings", None), + "max_position_embeddings": self.config.rope_scaling.get("attention_factor", None), + "base": self.config.rope_scaling.get("beta_fast", None), + "scaling_factor": self.config.rope_scaling.get("beta_slow", None), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if scaling_type == "linear": self.rotary_emb = LlamaLinearScalingRotaryEmbedding( self.head_dim, @@ -272,6 +352,14 @@ def _init_rope(self): scaling_factor=scaling_factor, base=self.rope_theta, ) + elif scaling_type == "yarn": + self.rotary_emb = LlamaYarnScalingRotaryEmbedding( + 
self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + **kwargs, + ) else: raise ValueError(f"Unknown RoPE scaling type {scaling_type}") diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index a25ccd8cc09d..77a3b18e364e 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -160,7 +160,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 4fd0c9268683..59c9b3bf1b66 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -236,7 +236,6 @@ def __init__(self, config: OlmoConfig, layer_idx: Optional[int] = None): self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) self._init_rope() - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Olmo def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = OlmoRotaryEmbedding( diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index b8e02256de80..11f4c66d73e6 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -138,7 +138,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index d1e3464ee482..e54d400ae6e7 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -165,7 +165,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index abea7483a67d..c05ac9f036d6 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -164,7 +164,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. 
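As a standalone illustration of the NTK-by-parts blending implemented by `compute_yarn_scaling` earlier in this patch, here is a hedged sketch with toy values (`dim`, `base`, `factor` and the YaRN bounds below are examples, not model defaults; the `low == high` singularity guard from the patch is omitted for brevity):

```python
import math

import torch

dim, base, factor = 8, 10000.0, 2.0
original_max_position_embeddings, beta_fast, beta_slow = 2048, 32, 1

pos_freqs = base ** (torch.arange(0, dim, 2).float() / dim)
inv_freq_extrapolation = 1.0 / pos_freqs             # original RoPE frequencies
inv_freq_interpolation = 1.0 / (factor * pos_freqs)  # frequencies divided by the scaling factor


def find_correction_dim(num_rotations):
    # Inverse dimension formula: which dimension completes `num_rotations` over the original context
    return (dim * math.log(original_max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


low = max(math.floor(find_correction_dim(beta_fast)), 0)
high = min(math.ceil(find_correction_dim(beta_slow)), dim - 1)

ramp = torch.clamp((torch.arange(dim // 2, dtype=torch.float32) - low) / (high - low), 0, 1)
inv_freq_mask = 1 - ramp
inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
print(inv_freq)  # high-frequency dims keep the original values, low-frequency dims are interpolated
```

High-frequency dimensions keep the original (extrapolated) frequencies, low-frequency dimensions are divided by the scaling factor, and the dimensions in between are linearly blended.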
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index e0311b7cea4a..de7eb7e44156 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -55,6 +55,7 @@ LlamaDynamicNTKScalingRotaryEmbedding, LlamaLinearScalingRotaryEmbedding, LlamaRotaryEmbedding, + LlamaYarnScalingRotaryEmbedding, ) @@ -397,7 +398,7 @@ def test_llama_token_classification_model(self): def test_save_load_fast_init_from_base(self): pass - @parameterized.expand([("linear",), ("dynamic",)]) + @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() short_input = ids_tensor([1, 10], config.vocab_size) @@ -491,6 +492,26 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + # Sanity check Yarn RoPE scaling + yarn_scaling_rope = LlamaYarnScalingRotaryEmbedding( + head_dim, + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, + scaling_factor=scaling_factor, + ).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + @require_flash_attn @require_torch_gpu @require_bitsandbytes From 1535a2c93d325e529dc9a1907f99247fdf8a58e7 Mon Sep 17 00:00:00 2001 From: Daniel Lok Date: Tue, 23 Jul 2024 17:26:00 +0800 Subject: [PATCH 008/200] Disable quick init for TapasPreTrainedModel (#32149) add attribute to model Signed-off-by: Daniel Lok --- src/transformers/models/tapas/modeling_tapas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index a06770778e71..17a01da150b5 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -724,6 +724,7 @@ class TapasPreTrainedModel(PreTrainedModel): config_class = TapasConfig base_model_prefix = "tapas" supports_gradient_checkpointing = True + _supports_param_buffer_assignment = False # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): From 5a4a76edb7ac6bbc764392e89adc11adda91f3e5 Mon Sep 17 00:00:00 2001 From: bayllama <142558246+bayllama@users.noreply.github.com> Date: Tue, 23 Jul 2024 02:28:44 -0700 Subject: [PATCH 009/200] Modify resize_token_embeddings to ensure output type is same as input (#31979) * Change resize_token_embeddings to make it return same Class that is passed to it * Add explanatory comment as requested in review * Add explanatory comments for add resizing function in lxmert * Add comment for padding_idx and moving _resize_bias in lxmert to LxmertForPreTraining --------- 
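For context, a hedged sketch of the behaviour this change is meant to guarantee (the checkpoint and the `MyEmbedding` subclass are illustrative, not part of the patch):

```python
import torch.nn as nn
from transformers import AutoModelForCausalLM


class MyEmbedding(nn.Embedding):
    """Custom embedding subclass whose type should survive resizing."""


model = AutoModelForCausalLM.from_pretrained("gpt2")
old = model.get_input_embeddings()
# Freshly initialized weights are fine here; we only check type preservation.
model.set_input_embeddings(MyEmbedding(old.num_embeddings, old.embedding_dim))

resized = model.resize_token_embeddings(old.num_embeddings + 10)
assert isinstance(resized, MyEmbedding)  # previously a plain nn.Embedding was returned
```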
Co-authored-by: Prashanth Sateesh Co-authored-by: Prashanth Sateesh --- src/transformers/modeling_utils.py | 13 ++++++++++++- .../models/lxmert/modeling_lxmert.py | 16 ++++++++++++++++ tests/test_modeling_common.py | 5 +++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a20b7d941fbf..81403f524f9e 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2128,7 +2128,18 @@ def _get_resized_embeddings( else: new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] - return new_embeddings + # Replace weights in old_embeddings and return to maintain the same embedding type. + # This ensures correct functionality when a Custom Embedding class is passed as input. + # The input and output embedding types remain consistent. (c.f. https://github.com/huggingface/transformers/pull/31979) + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + + # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` + # will be set to `None` in the resized embeddings. + if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + + return old_embeddings def _get_resized_lm_head( self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index b77b87318386..9113fc4fd0eb 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -1072,6 +1072,22 @@ def __init__(self, config): } self.visual_losses = visual_losses + def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding: + # Adding the following steps to resize bias to match the shape of resized embeddings + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + self.cls.predictions.bias = self._resize_bias(self.cls.predictions.bias, new_num_tokens) + return new_embeddings + + def _resize_bias(self, bias, new_num_tokens: int): + old_num_tokens = bias.shape[0] + if new_num_tokens <= old_num_tokens: + new_bias = bias[:new_num_tokens] + else: + extra_bias = torch.zeros(new_num_tokens - old_num_tokens, device=bias.device) + new_bias = torch.cat([bias, extra_bias]) + new_bias = nn.Parameter(new_bias) + return new_bias + def resize_num_qa_labels(self, num_labels): """ Build a resized question answering linear layer Module from a provided new linear layer. 
Increasing the size diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index dd041188cdca..19a945aec527 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1755,6 +1755,8 @@ def test_resize_tokens_embeddings(self): config = copy.deepcopy(original_config) model = model_class(config) model.to(torch_device) + model_embed_pre_resize = model.get_input_embeddings() + type_model_embed_pre_resize = type(model_embed_pre_resize) if self.model_tester.is_training is False: model.eval() @@ -1774,6 +1776,9 @@ def test_resize_tokens_embeddings(self): self.assertEqual(new_model_vocab_size, model_vocab_size + 10) # Check that it actually resizes the embeddings matrix self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check to make sure the type of embeddings returned post resizing is same as type of input + type_model_embed_post_resize = type(model_embed) + self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize) # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) From 2e113422b3504fe6de821bb9911b24273b11aa9c Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 23 Jul 2024 10:42:55 +0100 Subject: [PATCH 010/200] Llama: RoPE refactor (#32135) Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/__init__.py | 2 + src/transformers/modeling_rope_utils.py | 451 ++++++++++++++++++ .../models/chameleon/modeling_chameleon.py | 18 +- .../models/cohere/modeling_cohere.py | 9 +- .../models/jamba/modeling_jamba.py | 2 +- .../models/jetmoe/modeling_jetmoe.py | 2 +- .../models/llama/configuration_llama.py | 105 ++-- .../models/llama/modeling_llama.py | 284 ++++++----- .../models/mistral/modeling_mistral.py | 11 +- .../models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 18 +- .../models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 2 +- .../models/stablelm/modeling_stablelm.py | 2 +- .../models/starcoder2/modeling_starcoder2.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 3 + tests/models/llama/test_modeling_llama.py | 78 +-- tests/utils/test_modeling_rope_utils.py | 120 +++++ 20 files changed, 831 insertions(+), 286 deletions(-) create mode 100644 src/transformers/modeling_rope_utils.py create mode 100644 tests/utils/test_modeling_rope_utils.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index fe31cd3c237b..bc6e786358b6 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1295,6 +1295,7 @@ ) _import_structure["modeling_flash_attention_utils"] = [] _import_structure["modeling_outputs"] = [] + _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS"] _import_structure["modeling_utils"] = ["PreTrainedModel"] # PyTorch models structure @@ -6010,6 +6011,7 @@ WatermarkLogitsProcessor, WhisperTimeStampLogitsProcessor, ) + from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from .modeling_utils import PreTrainedModel from .models.albert import ( AlbertForMaskedLM, diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py new file mode 100644 index 000000000000..33055d2bf942 --- /dev/null +++ 
b/src/transformers/modeling_rope_utils.py @@ -0,0 +1,451 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Optional, Tuple + +from .configuration_utils import PretrainedConfig +from .utils import is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +def _compute_default_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_linear_scaling_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. 
+ Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + factor = rope_kwargs["factor"] + elif config is not None: + factor = config.rope_scaling["factor"] + + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + # Then applies linear scaling to the frequencies. + # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so + # applying scaling to the inverse frequencies is equivalent. + inv_freq /= factor + return inv_freq, attention_factor + + +def _compute_dynamic_ntk_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length, used to update the dynamic RoPE at inference time. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + max_position_embeddings = rope_kwargs["max_position_embeddings"] + factor = rope_kwargs["factor"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + attention_factor = 1.0 # Unused in this type of RoPE + + # seq_len: default to max_position_embeddings, e.g. at init time + seq_len = seq_len if seq_len is not None else max_position_embeddings + + # Compute the inverse frequencies + base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_yarn_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. 
Please refer to the + [original paper](https://arxiv.org/abs/2309.00071) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with yarn, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + # Sets the attention factor as suggested in the paper + attention_factor = config.rope_scaling.get("attention_factor") + if attention_factor is None: + attention_factor = 0.1 * math.log(factor) + 1.0 + + # Optional config options + # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) + beta_fast = config.rope_scaling.get("beta_fast") or 32 + beta_slow = config.rope_scaling.get("beta_slow") or 1 + + # Compute the inverse frequencies + def find_correction_dim(num_rotations, dim, base, max_position_embeddings): + """Inverse dimension formula to find the dimension based on the number of rotations""" + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): + """Find dimension range bounds based on rotations""" + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (factor * pos_freqs) + + low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings) + + # Get n-dimensional rotational scaling corrected for extrapolation + inv_freq_mask = 1 - linear_ramp_mask(low, high, dim // 2).float().to(device) + inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + + return inv_freq, attention_factor + + +def _compute_longrope_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with LongRoPE scaling. Please refer to the + [original implementation](https://github.com/microsoft/LongRoPE) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. 
+ device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with longrope, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " + f"{rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + long_factor = config.rope_scaling["long_factor"] + short_factor = config.rope_scaling["short_factor"] + factor = config.rope_scaling.get("factor") + attention_factor = config.rope_scaling.get("attention_factor") + + # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a + # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two + # values to compute the default attention scaling factor, instead of using `factor`. + if hasattr(config, "original_max_position_embeddings"): + max_position_embeddings = config.original_max_position_embeddings + expanded_max_position_embeddings = config.max_position_embeddings + factor = expanded_max_position_embeddings / max_position_embeddings + else: + max_position_embeddings = config.max_position_embeddings + expanded_max_position_embeddings = max_position_embeddings * factor + + # Sets the attention factor as suggested in the paper + if attention_factor is None: + if factor <= 1.0: + attention_factor = 1.0 + else: + attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings)) + + # Compute the inverse frequencies -- scaled based on the target sequence length + if expanded_max_position_embeddings > max_position_embeddings: + ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) + else: + ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) + inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim + inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) + + return inv_freq, attention_factor + + +# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters +# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE +# parameterizations, as long as the callable has the same signature. 
+ROPE_INIT_FUNCTIONS = { + "default": _compute_default_rope_parameters, + "linear": _compute_linear_scaling_rope_parameters, + "dynamic": _compute_dynamic_ntk_parameters, + "yarn": _compute_yarn_parameters, + "longrope": _compute_longrope_parameters, +} + + +def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None): + """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + missing_keys = required_keys - received_keys + if missing_keys: + raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") + + if optional_keys is not None: + unused_keys = received_keys - required_keys - optional_keys + else: + unused_keys = received_keys - received_keys + if unused_keys: + raise KeyError(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + + +def _validate_default_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + +def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + +def _validate_yarn_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + optional_keys = {"attention_factor", "beta_fast", "beta_slow"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + beta_fast = rope_scaling.get("beta_fast") + if beta_fast is not None and not isinstance(beta_fast, float): + raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_scaling.get("beta_slow") + if beta_slow is not None and not isinstance(beta_slow, float): + raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + + if (beta_fast or 32) < (beta_slow or 1): + raise ValueError( + f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" + ) + + +def _validate_longrope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "short_factor", "long_factor"} + optional_keys = {"attention_factor", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, 
required_keys, optional_keys) + + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + short_factor = rope_scaling.get("short_factor") + if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): + raise ValueError(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + if not len(short_factor) == dim // 2: + raise ValueError(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + + long_factor = rope_scaling.get("long_factor") + if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): + raise ValueError(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + if not len(long_factor) == dim // 2: + raise ValueError(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + + # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over + # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is + # unique to longrope (= undesirable) + if hasattr(config, "original_max_position_embeddings"): + logger.warning_once( + "This model has set a `original_max_position_embeddings` field, to be used together with " + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`" + "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " + "as it is compatible with most model architectures." + ) + else: + factor = rope_scaling.get("factor") + if factor is None: + raise ValueError("Missing required keys in `rope_scaling`: 'factor'") + elif not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + + +# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. +ROPE_VALIDATION_FUNCTIONS = { + "default": _validate_default_rope_parameters, + "linear": _validate_linear_scaling_rope_parameters, + "dynamic": _validate_linear_scaling_rope_parameters, # `dynamic` has the same validation pattern as `linear` + "yarn": _validate_yarn_parameters, + "longrope": _validate_longrope_parameters, +} + + +def rope_config_validation(config: PretrainedConfig): + """ + Validate the RoPE config arguments, given a `PretrainedConfig` object + """ + rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig` + if rope_scaling is None: + return + + possible_rope_types = set(ROPE_INIT_FUNCTIONS.keys()) + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + if rope_type is None: + raise ValueError( + f"rope_scaling must contain a non-None 'rope_type' field. 
Possible options are {possible_rope_types}" + ) + + validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) + if validation_fn is not None: + validation_fn(config) + else: + raise ValueError( + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + ) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 1eea9b224958..cd10850ae36b 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -80,7 +80,8 @@ def forward(self, hidden_states): ALL_LAYERNORM_LAYERS.append(ChameleonRMSNorm) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() @@ -110,7 +111,8 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" @@ -121,7 +123,8 @@ def forward(self, x, position_ids): return cos, sin -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" @@ -265,7 +268,8 @@ def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None): self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim)) self._init_rope() - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # TODO(joao): add me back asap :) def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = ChameleonRotaryEmbedding( @@ -358,7 +362,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonFlashAttention2(ChameleonAttention): """ Chameleon flash attention module. 
This module inherits from `ChameleonAttention` as the weights of the module stays @@ -576,7 +581,8 @@ def forward( } -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# TODO(joao): add me back asap :) class ChameleonDecoderLayer(nn.Module): def __init__(self, config: ChameleonConfig, layer_idx: int): super().__init__() diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 5322c2334d37..6532c656d453 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -295,7 +295,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# TODO(joao): add me back asap :) class CohereFlashAttention2(CohereAttention): """ Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays @@ -409,7 +410,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# TODO(joao): add me back asap :) class CohereSdpaAttention(CohereAttention): """ Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -697,7 +699,8 @@ def _init_weights(self, module): "The bare Cohere Model outputting raw hidden-states without any specific head on top.", COHERE_START_DOCSTRING, ) -# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# TODO(joao): add me back asap :) class CohereModel(CoherePreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`CohereDecoderLayer`] diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 28d732628a28..6a03dc82a698 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1624,7 +1624,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 16d8335e0a52..fa15393a40a5 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1363,7 +1363,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(JETMOE_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 843731eeffc8..7c987ec85a04 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -20,10 +20,7 @@ """LLaMA model configuration""" from ...configuration_utils import PretrainedConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) +from ...modeling_rope_utils import rope_config_validation class LlamaConfig(PretrainedConfig): @@ -84,22 +81,35 @@ class LlamaConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear, dynamic and yarn. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - For the `yarn` strategy, the dictionary may also contain the following fields: - `original_max_position_embeddings` (`int`, *optional*): - The original maximum sequence length. This is used to scale the RoPE embeddings. + Dictionary containing the scaling configuration for the RoPE embeddings. IMPORTANT: RoPE scaling expects + `max_position_embeddings` to remain unchanged -- some methods, like 'longrope', require the original value + to determine which scaling to apply. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope'], + with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. 
The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + `max_position_embeddings`. `attention_factor` (`float`, *optional*): - The attention scaling factor. If unspecified, it defaults to `0.1 ln(s) + 1`, where `s` is the `original_max_position_embeddings/max_position_embeddings` ratio. + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. `beta_fast` (`float`, *optional*): - Parameter to set the boundary for extrapolation (only) in the linear ramp function. + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. `beta_slow` (`float`, *optional*): - Parameter to set the boundary for interpolation (only) in the linear ramp function. + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -167,11 +177,13 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -179,60 +191,3 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
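# Illustrative sketch (not taken from this patch): how the new `rope_scaling`
# schema documented above is expected to be used. The factor value is a dummy;
# 'yarn' additionally accepts `attention_factor`, `beta_fast` and `beta_slow`,
# while 'longrope' uses `short_factor`/`long_factor`.
from transformers import LlamaConfig

config = LlamaConfig(
    max_position_embeddings=4096,  # left unchanged, as the docstring above requires
    rope_scaling={"rope_type": "linear", "factor": 2.0},
)
# `rope_config_validation(config)` now runs inside `__init__` and raises a KeyError
# if a required key such as `factor` is missing for the chosen rope_type.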
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) < 2: - raise ValueError( - "`rope_scaling` must be a dictionary with a minimum of two fields, `type` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "yarn"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'yarn'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") - - if rope_scaling_type != "yarn": - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) > 6: - raise ValueError( - "`rope_scaling` with type " - f"{rope_scaling_type}" - " must be a dictionary with a maximum of six fields, `type`, `factor`," - "`original_max_position_embeddings`, `attention_factor`, `beta_fast`, `beta_slow`, " - f"got {self.rope_scaling}" - ) - original_max_position_embeddings = self.rope_scaling.get("original_max_position_embeddings", None) - attention_factor = self.rope_scaling.get("attention_factor", None) - beta_fast = self.rope_scaling.get("beta_fast", None) - beta_slow = self.rope_scaling.get("beta_slow", None) - - if original_max_position_embeddings is not None and not isinstance(original_max_position_embeddings, int): - raise ValueError( - f"`rope_scaling`'s original_max_position_embeddings field must be an int, got {original_max_position_embeddings}" - ) - if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: - raise ValueError( - f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" - ) - if beta_fast is not None and not isinstance(beta_fast, float): - raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") - if beta_slow is not None and not isinstance(beta_slow, float): - raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") - - b_fast = beta_fast if beta_fast is not None else 32 - b_slow = beta_slow if beta_slow is not None else 1 - if b_fast < b_slow: - raise ValueError( - f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={b_fast} and beta_slow={b_slow}" - ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index b624a2d92d09..3115cee78f76 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -37,6 +37,7 @@ SequenceClassifierOutputWithPast, TokenClassifierOutput, ) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( @@ -75,24 +76,77 @@ def forward(self, hidden_states): class LlamaRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[LlamaConfig] = None, + ): super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = 
max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. All other arguments will be removed in v4.45" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling["type"]) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) self.register_buffer("inv_freq", inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len @torch.no_grad() def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) device_type = x.device.type device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): @@ -100,107 +154,37 @@ def forward(self, x, position_ids): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + + # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is aplied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)." + ) + kwargs["rope_type"] = "linear" + super().__init__(*args, **kwargs) class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / ( - base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin - - -class LlamaYarnScalingRotaryEmbedding(LlamaRotaryEmbedding): - def __init__( - self, - dim, - max_position_embeddings=2048, - base=10000, - scaling_factor=1, - original_max_position_embeddings=2048, - attention_factor=None, - beta_fast=32, - beta_slow=1, - device=None, - ): - super().__init__(dim, max_position_embeddings, base, device, scaling_factor) - - self.original_max_position_embeddings = original_max_position_embeddings - self.attention_factor = attention_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - - if self.attention_factor is None: - # Recommended attention factor for LLaMA models. - # For more details please refer to https://arxiv.org/pdf/2309.00071, Eq. 22. 
- self.attention_factor = 0.1 * math.log(scaling_factor) + 1.0 - - self.compute_yarn_scaling(device) - - # Inverse dimension formula to find the dimension based on the number of rotations - def find_correction_dim(self, num_rotations, dim, base=10000, max_position_embeddings=2048): - return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) - - # Find dimension range bounds based on rotations - def find_correction_range(self, low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): - low = math.floor(self.find_correction_dim(low_rot, dim, base, max_position_embeddings)) - high = math.ceil(self.find_correction_dim(high_rot, dim, base, max_position_embeddings)) - return max(low, 0), min(high, dim - 1) - - def linear_ramp_mask(self, min, max, dim): - if min == max: - max += 0.001 # Prevent singularity - - linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - def forward(self, x, position_ids=None): - # Difference to the original RoPE: applies a scaling factor computed with - # the YaRN method (NTK-by-Parts + Attn Scaling) - # x: [bs, num_attention_heads, seq_len, head_size] - cos, sin = super().forward(x, position_ids) - cos = cos * self.mscale - sin = sin * self.mscale - return cos, sin - - def compute_yarn_scaling(self, device): - pos_freqs = self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) - inv_freq_extrapolation = 1.0 / pos_freqs - inv_freq_interpolation = 1.0 / (self.scaling_factor * pos_freqs) - - low, high = self.find_correction_range( - self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to " + "__init__)." 
) - # Get n-dimensional rotational scaling corrected for extrapolation - inv_freq_mask = 1 - self.linear_ramp_mask(low, high, self.dim // 2).float().to(device) - inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask - - self.register_buffer("inv_freq", inv_freq) - # Get n-dimensional magnitude scaling corrected for interpolation - self.mscale = self.attention_factor + kwargs["rope_type"] = "dynamic" + super().__init__(*args, **kwargs) def rotate_half(x): @@ -317,51 +301,9 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - # Yarn parameters - kwargs = { - "dim": self.config.rope_scaling.get("original_max_position_embeddings", None), - "max_position_embeddings": self.config.rope_scaling.get("attention_factor", None), - "base": self.config.rope_scaling.get("beta_fast", None), - "scaling_factor": self.config.rope_scaling.get("beta_slow", None), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "yarn": - self.rotary_emb = LlamaYarnScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - **kwargs, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = LlamaRotaryEmbedding(config=self.config) def forward( self, @@ -372,6 +314,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -402,7 +345,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing 
cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -471,6 +423,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -493,7 +446,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -573,6 +535,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: @@ -589,6 +552,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) bsz, q_len, _ = hidden_states.size() @@ -601,7 +565,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
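# Illustrative sketch (not taken from this patch): the calling convention being
# introduced here -- RoPE is computed once per forward pass by the model and the
# resulting (cos, sin) tuple is handed to every decoder layer, so the attention
# modules only fall back to `position_ids` (with the warning above) for backward
# compatibility. Tensor shapes below are dummy values.
import torch

from transformers import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding

config = LlamaConfig()
rotary_emb = LlamaRotaryEmbedding(config=config)

hidden_states = torch.randn(1, 10, config.hidden_size)
position_ids = torch.arange(10).unsqueeze(0)
cos, sin = rotary_emb(hidden_states, position_ids)  # computed once, shared across layers
# each decoder layer / attention module then receives `position_embeddings=(cos, sin)`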
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -671,6 +644,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -688,6 +662,9 @@ def forward( past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. kwargs (`dict`, *optional*): Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code into the model @@ -705,6 +682,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = residual + hidden_states @@ -867,6 +845,7 @@ def __init__(self, config: LlamaConfig): [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = LlamaRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -933,10 +912,11 @@ def forward( causal_mask = self._update_causal_mask( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) - - # embed positions hidden_states = inputs_embeds + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -956,6 +936,7 @@ def forward( output_attentions, use_cache, cache_position, + position_embeddings, ) else: layer_outputs = decoder_layer( @@ -966,6 +947,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) hidden_states = layer_outputs[0] @@ -1280,7 +1262,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 8e2f4dd5a44a..dd814cd75fb1 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -85,7 +85,8 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.register_buffer("inv_freq", inv_freq, persistent=False) @torch.no_grad() - # Copied from 
transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # TODO(joao): add me back asap :) def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) @@ -396,7 +397,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO(joao): add me back asap :) class MistralSdpaAttention(MistralAttention): """ Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -492,7 +494,8 @@ def forward( } -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Mistral, LLAMA->MISTRAL +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Mistral, LLAMA->MISTRAL +# TODO(joao): add me back asap :) class MistralDecoderLayer(nn.Module): def __init__(self, config: MistralConfig, layer_idx: int): super().__init__() @@ -1146,7 +1149,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 4b88afcded37..82320de79386 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1362,7 +1362,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 59c9b3bf1b66..a56baf0653ec 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -74,7 +74,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ALL_LAYERNORM_LAYERS.append(OlmoLayerNorm) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() @@ -104,7 +105,8 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoLinearScalingRotaryEmbedding(OlmoRotaryEmbedding): """OlmoRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" @@ -115,7 +117,8 @@ def forward(self, x, position_ids): return cos, sin -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoDynamicNTKScalingRotaryEmbedding(OlmoRotaryEmbedding): """OlmoRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" @@ -202,7 +205,8 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class OlmoAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - # Copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo + # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo + # TODO(joao): add me back asap :) def __init__(self, config: OlmoConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config @@ -549,7 +553,8 @@ def __init__(self, config: OlmoConfig, layer_idx: int): self.input_layernorm = OlmoLayerNorm(config.hidden_size) self.post_attention_layernorm = OlmoLayerNorm(config.hidden_size) - # Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward + # copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward + # TODO(joao): add me back asap :) def forward( self, hidden_states: torch.Tensor, @@ -768,7 +773,8 @@ def set_input_embeddings(self, value): self.embed_tokens = value @add_start_docstrings_to_model_forward(OLMO_INPUTS_DOCSTRING) - # Copied from transformers.models.llama.modeling_llama.LlamaModel.forward + # copied from transformers.models.llama.modeling_llama.LlamaModel.forward + # TODO(joao): add me back asap :) def forward( self, input_ids: torch.LongTensor = None, diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index fc1b729fa654..af22145e3e9d 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -999,7 +999,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(PERSIMMON_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 7ad34a578083..f80453d3f7d9 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -1282,7 +1282,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index b7d05bbed6ca..a32f8531e487 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1278,7 +1278,7 @@ def 
set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index c20d74fb18c4..d88b5c357e86 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1370,7 +1370,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index a17218361802..ea50a20edea8 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -1275,7 +1275,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 430befd24ae3..af532b139ca3 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -1153,7 +1153,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 81d4c2105586..de739c6e7004 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -485,6 +485,9 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +ROPE_INIT_FUNCTIONS = None + + class PreTrainedModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index de7eb7e44156..85d352fc814f 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -51,12 +51,7 @@ LlamaModel, LlamaTokenizer, ) - from transformers.models.llama.modeling_llama import ( - LlamaDynamicNTKScalingRotaryEmbedding, - LlamaLinearScalingRotaryEmbedding, - LlamaRotaryEmbedding, - LlamaYarnScalingRotaryEmbedding, - ) + from transformers.models.llama.modeling_llama import LlamaLinearScalingRotaryEmbedding, LlamaRotaryEmbedding class LlamaModelTester: @@ -431,9 +426,6 @@ def 
test_model_rope_scaling_from_config(self, scaling_type): def test_model_rope_scaling(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - head_dim = hidden_size // num_heads scaling_factor = 10 short_input_length = 10 long_input_length = int(config.max_position_embeddings * 1.5) @@ -446,11 +438,7 @@ def test_model_rope_scaling(self): position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - original_rope = LlamaRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - ).to(torch_device) + original_rope = LlamaRotaryEmbedding(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) @@ -458,12 +446,8 @@ def test_model_rope_scaling(self): # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - linear_scaling_rope = LlamaLinearScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) @@ -476,12 +460,8 @@ def test_model_rope_scaling(self): # Sanity check Dynamic NTK RoPE scaling # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - ntk_scaling_rope = LlamaDynamicNTKScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) torch.testing.assert_close(ntk_cos_short, original_cos_short) @@ -493,12 +473,9 @@ def test_model_rope_scaling(self): self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) # Sanity check Yarn RoPE scaling - yarn_scaling_rope = LlamaYarnScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) @@ -512,6 +489,43 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) + def test_rope_class_retrocompatibility(self): + # Delete me when we remove compatibility for the old API :) + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + config.rope_scaling = {"type": "linear", "factor": 10} + + # Inputs + x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Old API -- under the hood, "type": "linear" is set and `LlamaRotaryEmbedding` is called + old_api_rope = LlamaLinearScalingRotaryEmbedding( + config.hidden_size // config.num_attention_heads, + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, + scaling_factor=scaling_factor, + ).to(torch_device) + old_cos_short, old_sin_short = old_api_rope(x, position_ids_short) + old_cos_long, old_sin_long = old_api_rope(x, position_ids_long) + + # New API + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + new_api_rope = LlamaRotaryEmbedding(config=config).to(torch_device) + new_cos_short, new_sin_short = new_api_rope(x, position_ids_short) + new_cos_long, new_sin_long = new_api_rope(x, position_ids_long) + + # The results should match + torch.testing.assert_close(old_cos_short, new_cos_short) + torch.testing.assert_close(old_sin_short, new_sin_short) + torch.testing.assert_close(old_cos_long, new_cos_long) + torch.testing.assert_close(old_sin_long, new_sin_long) + @require_flash_attn @require_torch_gpu @require_bitsandbytes diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py new 
file mode 100644 index 000000000000..847323d9bf23 --- /dev/null +++ b/tests/utils/test_modeling_rope_utils.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import LlamaConfig +from transformers.testing_utils import is_torch_available, require_torch, torch_device + + +if is_torch_available(): + import torch + + from transformers import ROPE_INIT_FUNCTIONS + from transformers.modeling_rope_utils import rope_config_validation + + +@require_torch +class RopeTest(unittest.TestCase): + def test_rope_validation(self): + config = LlamaConfig() + all_rope_types = ROPE_INIT_FUNCTIONS.keys() + + # The base config is always valid (default RoPE) + rope_config_validation(config) + + # If we explicitly set the other RoPE types, then validation should fail + for rope_type in all_rope_types: + if rope_type != "default": + config.rope_scaling = {"rope_type": rope_type} + with self.assertRaises(KeyError): + rope_config_validation(config) + + # Parameters are exclusive to their own RoPE type, and should raise an exception if incorrectly passed + valid_param_mapping = { + "factor": ["linear", "dynamic", "yarn", "longrope"], + "attention_factor": ["yarn", "longrope"], + "beta_fast": ["yarn"], + "beta_slow": ["yarn"], + "short_factor": ["longrope"], + "long_factor": ["longrope"], + } + for rope_type in all_rope_types: + if rope_type == "default": + continue # checked above + for param, valid_rope_types in valid_param_mapping.items(): + # Set `param` with a dummy value -- we want to test the dict key + config.rope_scaling = {"rope_type": rope_type, param: True} + if rope_type in valid_rope_types: + continue + else: + with self.assertRaises(KeyError): + rope_config_validation(config) + + def test_default_rope_function_bc(self): + config = LlamaConfig() + device = torch_device + + rope_kwargs = { + "rope_type": "default", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + } + + rope_fn = ROPE_INIT_FUNCTIONS["default"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + def test_linear_rope_function_bc(self): + config = LlamaConfig() + config.rope_scaling = {"rope_type": "linear", "factor": 10.0} + device = torch_device + + rope_kwargs = { + "rope_type": "linear", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + "factor": 10.0, + } + + rope_fn = ROPE_INIT_FUNCTIONS["linear"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + def test_dynamic_rope_function_bc(self): + config = LlamaConfig() + config.rope_scaling = {"rope_type": "dynamic", "factor": 10.0} + device = 
torch_device + + rope_kwargs = { + "rope_type": "dynamic", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + "factor": 10.0, + } + + rope_fn = ROPE_INIT_FUNCTIONS["dynamic"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + +# TODO(joao): numerical checks for the different RoPE fns From a1844a3209eb7e75582684809203bc189931a90c Mon Sep 17 00:00:00 2001 From: Ita Zaporozhets <31893021+itazap@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:45:54 +0200 Subject: [PATCH 011/200] gguf conversion add_prefix_space=None for llama3 (#31937) * gguf conversion forces add_prefix_space=False for llama3, this is not required and forces from_slow, which fails. changing to None + test * typo * clean test --- src/transformers/integrations/ggml.py | 2 +- tests/quantization/ggml/test_ggml.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 5c2d72c345ec..71aa87afa94b 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -609,7 +609,7 @@ def tokenizer(self, proto): self.additional_kwargs["bos_token"] = eos_token if self.is_llama_3_tokenizer: - self.additional_kwargs["add_prefix_space"] = False + self.additional_kwargs["add_prefix_space"] = None self.additional_kwargs["clean_up_tokenization_spaces"] = True self.additional_kwargs["legacy"] = False diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index db96e9052c5f..a5866094a1cc 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -174,10 +174,13 @@ def test_qwen2_q4_0(self): self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) def test_llama3_q4_0_tokenizer(self): - tokenizer_gguf = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id) - special_sentence = "สวัสดี" - predicted_text = tokenizer_gguf.decode(tokenizer_gguf.encode(special_sentence, return_tensors="pt")[0]) - self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence) + tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id) + with tempfile.TemporaryDirectory() as tmpdirname: + tokenizer.save_pretrained(tmpdirname) + tokenizer = AutoTokenizer.from_pretrained(tmpdirname) + special_sentence = "สวัสดี" + predicted_text = tokenizer.decode(tokenizer.encode(special_sentence, return_tensors="pt")[0]) + self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence) def test_llama3_q4_0(self): tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id) From a5b226ce9811aa6b31af0bc9c09c54493a4e67c1 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 23 Jul 2024 12:21:23 +0200 Subject: [PATCH 012/200] Fix flash attention speed issue (#32028) Add the lru_cache for speed --- src/transformers/utils/import_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index f81b9d3dba41..a5ea4eb1850c 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -820,6 +820,7 @@ def is_flash_attn_greater_or_equal_2_10(): return 
version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0") +@lru_cache() def is_flash_attn_greater_or_equal(library_version: str): if not _is_package_available("flash_attn"): return False From 9ced33ca7f909d9ace743dac083daba99c904d46 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Tue, 23 Jul 2024 13:23:23 +0300 Subject: [PATCH 013/200] Fix video batching to videollava (#32139) --------- Co-authored-by: Merve Noyan --- .../image_processing_video_llava.py | 7 ++- .../test_image_processing_video_llava.py | 43 ++++++++++++++----- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 82ac5869c017..943c2fe51a0e 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -55,8 +55,11 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]) and len(videos[0].shape) == 4: - return [list(video) for video in videos] + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): + return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] elif is_valid_image(videos) and len(videos.shape) == 4: return [list(videos)] diff --git a/tests/models/video_llava/test_image_processing_video_llava.py b/tests/models/video_llava/test_image_processing_video_llava.py index 4a5c2516267e..03cfb033ffb9 100644 --- a/tests/models/video_llava/test_image_processing_video_llava.py +++ b/tests/models/video_llava/test_image_processing_video_llava.py @@ -97,8 +97,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F torchify=torchify, ) - def prepare_video_inputs(self, equal_resolution=False, torchify=False): - numpify = not torchify + def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False): images = prepare_image_inputs( batch_size=self.batch_size, num_channels=self.num_channels, @@ -108,15 +107,19 @@ def prepare_video_inputs(self, equal_resolution=False, torchify=False): numpify=numpify, torchify=torchify, ) - # let's simply copy the frames to fake a long video-clip - videos = [] - for image in images: - if numpify: - video = image[None, ...].repeat(8, 0) - else: - video = image[None, ...].repeat(8, 1, 1, 1) - videos.append(video) + if numpify or torchify: + videos = [] + for image in images: + if numpify: + video = image[None, ...].repeat(8, 0) + else: + video = image[None, ...].repeat(8, 1, 1, 1) + videos.append(video) + else: + videos = [] + for pil_image in images: + videos.append([pil_image] * 8) return videos @@ -197,7 +200,7 @@ def test_call_numpy_videos(self): # Initialize image_processing image_processing = self.image_processing_class(**self.image_processor_dict) # create random numpy tensors - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True) + video_inputs = self.image_processor_tester.prepare_video_inputs(numpify=True, equal_resolution=True) for video in video_inputs: self.assertIsInstance(video, np.ndarray) @@ -211,6 +214,24 @@ def test_call_numpy_videos(self): expected_output_video_shape = (5, 8, 3, 18, 18) self.assertEqual(tuple(encoded_videos.shape), 
expected_output_video_shape) + def test_call_pil_videos(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # the inputs come in list of lists batched format + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True) + for video in video_inputs: + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input + encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + def test_call_pytorch(self): # Initialize image_processing image_processing = self.image_processing_class(**self.image_processor_dict) From bab32d6fe932a3372fbd6d5a84e3cacb12a61ae0 Mon Sep 17 00:00:00 2001 From: Alexandre TL Date: Tue, 23 Jul 2024 12:32:19 +0200 Subject: [PATCH 014/200] Added mamba.py backend (#30139) * Update README.md * tests: forward ok * backward test done * done testing * removed check. scripts * Update README.md * added use_mambapy arg * fixed typo in warning * protected imports w/ mambapy package * delete pscan.py + raise rather than assert * Update import_utils.py * fix whitespaces and unused import * trailing whitespace + import block unformatted * Update modeling_mamba.py * transpose before pscan * shape comment * ran make style * use_mambapy=False by default Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * ran make fix-copies --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../models/mamba/configuration_mamba.py | 4 ++ .../models/mamba/modeling_mamba.py | 58 ++++++++++++++----- src/transformers/utils/import_utils.py | 6 ++ 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py index 460c1f3b32ac..89f08dd3cd32 100644 --- a/src/transformers/models/mamba/configuration_mamba.py +++ b/src/transformers/models/mamba/configuration_mamba.py @@ -79,6 +79,8 @@ class MambaConfig(PretrainedConfig): Whether or not to rescale `out_proj` weights when initializing. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the cache should be used. + use_mambapy (`bool`, *optional*, defaults to `False`): + Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not avaiable. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. 
Example: @@ -123,6 +125,7 @@ def __init__( time_step_floor=1e-4, rescale_prenorm_residual=False, use_cache=True, + use_mambapy=False, **kwargs, ): self.vocab_size = vocab_size @@ -149,5 +152,6 @@ def __init__( self.rescale_prenorm_residual = rescale_prenorm_residual self.residual_in_fp32 = residual_in_fp32 self.use_cache = use_cache + self.use_mambapy = use_mambapy super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 5edb28ad7416..50c0f9ebe4a5 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -33,12 +33,17 @@ add_start_docstrings_to_model_forward, logging, ) -from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available from .configuration_mamba import MambaConfig logger = logging.get_logger(__name__) +if is_mambapy_available(): + from mambapy.pscan import pscan +else: + pscan = None + if is_mamba_ssm_available(): from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn from mamba_ssm.ops.triton.selective_state_update import selective_state_update @@ -87,6 +92,8 @@ def __init__(self, config: MambaConfig, layer_idx: int): self.activation = config.hidden_act self.act = ACT2FN[config.hidden_act] + self.use_mambapy = config.use_mambapy + # projection of the input hidden states self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias) # selective projection used to make dt, B and C input dependant @@ -105,11 +112,23 @@ def __init__(self, config: MambaConfig, layer_idx: int): self.use_bias = config.use_bias if not is_fast_path_available: - logger.warning_once( - "The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" - " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" - " https://github.com/Dao-AILab/causal-conv1d" - ) + if self.use_mambapy: + if is_mambapy_available(): + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + raise ImportError( + "use_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py." + ) + else: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py." 
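# Illustrative sketch (not taken from this patch): opting into the mamba.py fallback
# described in the warning above. Model dimensions are dummy values; `pip install
# mambapy` is needed when the CUDA kernels are absent, otherwise constructing the
# mixer with `use_mambapy=True` raises the ImportError added above.
from transformers import MambaConfig, MambaForCausalLM

config = MambaConfig(hidden_size=256, num_hidden_layers=2, use_mambapy=True)
model = MambaForCausalLM(config)
# during training (no cache) without the CUDA kernels, `slow_forward` now runs the
# recurrence through mambapy's parallel scan instead of the sequential Python loop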
+ ) def cuda_kernels_forward( self, @@ -257,17 +276,24 @@ def slow_forward(self, input_states, cache_params: Optional[MambaCache]=None, ca deltaB_u = discrete_B * hidden_states[:, :, :, None].float() # 3.c perform the recurrence y ← SSM(A, B, C)(x) - scan_outputs = [] - for i in range(seq_len): - ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediate_size, ssm_state] - scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediate_size, 1] - scan_outputs.append(scan_output[:, :, 0]) - scan_output = torch.stack(scan_outputs, dim=-1) # [batch, intermediate_size, seq_len] - scan_output = scan_output + (hidden_states * self.D[None, :, None]) - scan_output = (scan_output * self.act(gate)) + if self.use_mambapy and self.training and cache_params is None: + hs = pscan(discrete_A.transpose(1, 2), deltaB_u.transpose(1, 2)) # [batch, seq_len, intermediate_size, ssm_state_size] - if cache_params is not None: - cache_params.update_ssm_state(self.layer_idx, ssm_state) + scan_output = (hs @ C.unsqueeze(-1)).squeeze(3).transpose(1, 2) # [batch, intermediate_size, seq_len] + scan_output = scan_output + hidden_states * self.D[None, :, None] + scan_output = scan_output * self.act(gate) + else: + scan_outputs = [] + for i in range(seq_len): + ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediade_size, ssm_state] + scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediade_size, 1] + scan_outputs.append(scan_output[:, :, 0]) + scan_output = torch.stack(scan_outputs, dim=-1) # [batch, seq_len, intermediade_size] + scan_output = scan_output + (hidden_states * self.D[None, :, None]) + scan_output = (scan_output * self.act(gate)) + + if cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) # 4. Final linear projection contextualized_states = self.out_proj(scan_output.transpose(1, 2)) # [batch, seq_len, hidden_size] diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index a5ea4eb1850c..c52da62c1de8 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -395,6 +395,12 @@ def is_causal_conv1d_available(): return False +def is_mambapy_available(): + if is_torch_available(): + return _is_package_available("mambapy") + return False + + def is_torch_mps_available(): if is_torch_available(): import torch From 034b47784765e37ecc20f7ad43640f1a2c0094fd Mon Sep 17 00:00:00 2001 From: Amit Garg Date: Tue, 23 Jul 2024 03:33:22 -0700 Subject: [PATCH 015/200] Rename Phi-3 rope scaling type (#31436) * renamed phi3 rope_scaling type * fixed trailing whitespaces * fixed test * added warning * fixed format --- .../models/phi3/configuration_phi3.py | 20 ++++++- src/transformers/models/phi3/modeling_phi3.py | 58 ++++++++++++++++--- tests/models/phi3/test_modeling_phi3.py | 2 +- 3 files changed, 69 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 8e1ac3628c2b..4940f43e5bff 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -78,7 +78,7 @@ class Phi3Config(PretrainedConfig): The base period of the RoPE embeddings. rope_scaling (`dict`, *optional*): The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. 
If a dictionary, it must - contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and + contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size divided by the number of attention heads divided by 2. bos_token_id (`int`, *optional*, defaults to 1): @@ -155,6 +155,7 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling + self._rope_scaling_adjustment() self._rope_scaling_validation() self.sliding_window = sliding_window @@ -166,6 +167,19 @@ def __init__( **kwargs, ) + def _rope_scaling_adjustment(self): + """ + Adjust the `type` of the `rope_scaling` configuration for backward compatibility. + """ + if self.rope_scaling is None: + return + + rope_scaling_type = self.rope_scaling.get("type", None) + + # For backward compatibility if previous version used "su" or "yarn" + if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]: + self.rope_scaling["type"] = "longrope" + def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. @@ -181,8 +195,8 @@ def _rope_scaling_validation(self): rope_scaling_type = self.rope_scaling.get("type", None) rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]: - raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}") + if rope_scaling_type is None or rope_scaling_type not in ["longrope"]: + raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}") if not ( isinstance(rope_scaling_short_factor, list) and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index a32f8531e487..76e3fbf514f6 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -16,6 +16,7 @@ """PyTorch Phi-3 model.""" import math +import warnings from typing import List, Optional, Tuple, Union import torch @@ -106,6 +107,51 @@ def forward(self, x, position_ids, seq_len=None): class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding): def __init__(self, dim, config, device=None): + warnings.warn( + "The class Phi3SuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. 
Please" + " use Phi3LongRoPEScaledRotaryEmbedding instead.", + FutureWarning, + ) + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) + else: + ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) + inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding): + def __init__(self, dim, config, device=None): + warnings.warn( + "The class Phi3YarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers", + FutureWarning, + ) super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) self.short_factor = config.rope_scaling["short_factor"] @@ -138,14 +184,14 @@ def forward(self, x, position_ids, seq_len=None): if scale <= 1.0: scaling_factor = 1.0 else: - scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + scaling_factor = 0.1 * math.log(scale) + 1.0 cos = emb.cos() * scaling_factor sin = emb.sin() * scaling_factor return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding): +class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding): def __init__(self, dim, config, device=None): super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) @@ -179,7 +225,7 @@ def forward(self, x, position_ids, seq_len=None): if scale <= 1.0: scaling_factor = 1.0 else: - scaling_factor = 0.1 * math.log(scale) + 1.0 + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) cos = emb.cos() * scaling_factor sin = emb.sin() * scaling_factor @@ -300,10 +346,8 @@ def _init_rope(self): ) else: scaling_type = self.config.rope_scaling["type"] - if scaling_type == "su": - self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config) - elif scaling_type == "yarn": - self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config) + if scaling_type == "longrope": + self.rotary_emb = 
Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config) else: raise ValueError(f"Unknown RoPE scaling type {scaling_type}") diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py index ad9c4c46aa93..1ddc73961bfe 100644 --- a/tests/models/phi3/test_modeling_phi3.py +++ b/tests/models/phi3/test_modeling_phi3.py @@ -362,7 +362,7 @@ def test_phi3_sequence_classification_model_for_multi_label(self): result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @parameterized.expand([("su",), ("yarn",)]) + @parameterized.expand([("longrope",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() short_input = ids_tensor([1, 10], config.vocab_size) From 3263b3435473cbb5dc66925bc29c1d32b5b8d431 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 23 Jul 2024 18:34:30 +0800 Subject: [PATCH 016/200] Revert "Incorrect Whisper long-form decoding timestamps " (#32148) Revert "Incorrect Whisper long-form decoding timestamps (#32003)" This reverts commit cd48553fc8375e1a28d4d82cfe231dedf6a23af8. --- .../models/clvp/processing_clvp.py | 1 + .../models/whisper/processing_whisper.py | 7 -- .../models/whisper/tokenization_whisper.py | 42 ++++-------- .../whisper/tokenization_whisper_fast.py | 42 ++++-------- tests/models/whisper/test_modeling_whisper.py | 66 ------------------- 5 files changed, 25 insertions(+), 133 deletions(-) diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index ebccab89d0fc..4e015cea1f84 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -73,6 +73,7 @@ def __call__(self, *args, **kwargs): inputs["attention_mask"] = encodings["attention_mask"] return inputs + # Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.batch_decode with Whisper->Clvp def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py index 07ece4314b24..f22aae143e6b 100644 --- a/src/transformers/models/whisper/processing_whisper.py +++ b/src/transformers/models/whisper/processing_whisper.py @@ -84,13 +84,6 @@ def batch_decode(self, *args, **kwargs): This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information. """ - - # If segments are present in args, we are performing long-form generation and need to return long form timestamps. - # The long-form timestamps are already present in segments and should be passed as kwargs to batch_decode. 
- if isinstance(args[0], dict) and "segments" in args[0]: - kwargs["longform_timestamps"] = args[0].pop("segments") - args = tuple(args[0]["sequences"].unsqueeze(0)) - return self.tokenizer.batch_decode(*args, **kwargs) def decode(self, *args, **kwargs): diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 0ff0bc8245ed..85e7dd04d8b2 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -558,7 +558,7 @@ def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_pre ] return "".join(outputs) - def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=None): + def _compute_offsets(self, token_ids, time_precision=0.02): """ Compute offsets for a given tokenized input @@ -567,8 +567,6 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N List of tokenized input ids. Can be obtained using the `__call__` method. time_precision (`float`, `optional`, defaults to 0.02): The time ratio to convert from token to time. - longform_timestamps (List[dict], *optional*): - Timestamps obtained using long form generation in Whisper, to be used to replace predicted timestamps in token_ids. """ offsets = [] # ensure torch tensor of token ids is placed on cpu @@ -589,7 +587,7 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1) last_slice = np.where(timestamp_tokens)[0][0] - for i, current_slice in enumerate(consecutive): + for current_slice in consecutive: sliced_tokens = token_ids[last_slice:current_slice] if len(sliced_tokens) > 1: start_timestamp_position = sliced_tokens[0].item() - timestamp_begin @@ -598,27 +596,15 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N sliced_tokens = self._preprocess_token_ids(sliced_tokens) text = self._decode(sliced_tokens) text = self._filter_timestamp_ids(text) - - if longform_timestamps is not None: - offsets.append( - { - "text": text, - "timestamp": ( - longform_timestamps[0][i]["start"].item(), - longform_timestamps[0][i]["end"].item(), - ), - } - ) - else: - offsets.append( - { - "text": text, - "timestamp": ( - start_timestamp_position * time_precision, - end_timestamp_position * time_precision, - ), - } - ) + offsets.append( + { + "text": text, + "timestamp": ( + start_timestamp_position * time_precision, + end_timestamp_position * time_precision, + ), + } + ) last_slice = current_slice return offsets @@ -727,11 +713,7 @@ def decode( # retrieve offsets if output_offsets: - longform_timestamps = kwargs.get("longform_timestamps") - offsets = self._compute_offsets( - token_ids, time_precision=time_precision, longform_timestamps=longform_timestamps - ) - + offsets = self._compute_offsets(token_ids, time_precision=time_precision) return {"text": text, "offsets": offsets} return text diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index 540056df8bd8..d1205d1a8ec0 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -200,7 +200,7 @@ def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_pre return "".join(outputs) # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._compute_offsets - 
def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=None): + def _compute_offsets(self, token_ids, time_precision=0.02): """ Compute offsets for a given tokenized input @@ -209,8 +209,6 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N List of tokenized input ids. Can be obtained using the `__call__` method. time_precision (`float`, `optional`, defaults to 0.02): The time ratio to convert from token to time. - longform_timestamps (List[dict], *optional*): - Timestamps obtained using long form generation in Whisper, to be used to replace predicted timestamps in token_ids. """ offsets = [] # ensure torch tensor of token ids is placed on cpu @@ -231,7 +229,7 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1) last_slice = np.where(timestamp_tokens)[0][0] - for i, current_slice in enumerate(consecutive): + for current_slice in consecutive: sliced_tokens = token_ids[last_slice:current_slice] if len(sliced_tokens) > 1: start_timestamp_position = sliced_tokens[0].item() - timestamp_begin @@ -240,27 +238,15 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N sliced_tokens = self._preprocess_token_ids(sliced_tokens) text = self._decode(sliced_tokens) text = self._filter_timestamp_ids(text) - - if longform_timestamps is not None: - offsets.append( - { - "text": text, - "timestamp": ( - longform_timestamps[0][i]["start"].item(), - longform_timestamps[0][i]["end"].item(), - ), - } - ) - else: - offsets.append( - { - "text": text, - "timestamp": ( - start_timestamp_position * time_precision, - end_timestamp_position * time_precision, - ), - } - ) + offsets.append( + { + "text": text, + "timestamp": ( + start_timestamp_position * time_precision, + end_timestamp_position * time_precision, + ), + } + ) last_slice = current_slice return offsets @@ -373,11 +359,7 @@ def decode( # retrieve offsets if output_offsets: - longform_timestamps = kwargs.get("longform_timestamps") - offsets = self._compute_offsets( - token_ids, time_precision=time_precision, longform_timestamps=longform_timestamps - ) - + offsets = self._compute_offsets(token_ids, time_precision=time_precision) return {"text": text, "offsets": offsets} return text diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 5a59f7a72517..38ccf82af462 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2243,72 +2243,6 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - @slow - def test_tiny_longform_timestamps_generation(self): - set_seed(0) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model.to(torch_device) - - sample = self._load_datasamples(1) - input_speech = np.concatenate(sample * 10) - - input_features = processor(input_speech, return_tensors="pt", truncation=False, sampling_rate=16_000) - input_features = input_features.to(torch_device) - - generated_ids = model.generate(**input_features, return_timestamps=True, return_segments=True) - - EXPECTED_TRANSCRIPT = [ - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "offsets": [ - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (0.0, 6.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (6.0, 12.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (12.0, 18.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (18.0, 24.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (24.0, 29.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (29.0, 35.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (35.0, 41.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (41.0, 47.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (47.0, 53.0), - }, - { - "text": " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (53.0, 58.20000076293945), - }, - ], - } - ] - - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - @slow def test_large_timestamp_generation(self): set_seed(0) From a009fbdab32a4b068c24052a4dfe7a7bc0fc89f9 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:23:34 +0100 Subject: [PATCH 017/200] Fix typing to be compatible with later py versions (#32155) --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 7ffd3bbcaa6b..9e478b63b284 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1948,7 +1948,7 @@ def is_active(self) -> bool: return self._rendered_blocks or self._generation_indices @contextmanager - def activate_tracker(self, rendered_blocks: list[int], generation_indices: list[int]): + def activate_tracker(self, rendered_blocks: List[int], generation_indices: List[int]): try: if self.is_active(): raise ValueError("AssistantTracker should not be reused before closed") From 63700628adb91600c84fe3bbbc4c667cd3e3aa71 Mon Sep 17 00:00:00 2001 From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:18:19 +0200 Subject: [PATCH 018/200] feat(cache): StaticCache uses index_copy_ to avoid useless copy (#31857) * feat(cache): StaticCache uses index_copy_ to avoid useless copy Using index_copy_ allows for explicit in-place change of the tensor. Some backends (XLA) will otherwise copy the tensor, making the code slower and using more memory. Proposed implementation will end up using less memory and on XLA will result in less compilation, but the change is also quite generic, making no change whatsoever on CUDA or CPU backend. * feat(cache): SlidingWindowCache uses index_copy_ to avoid useless copy Applying the same change done in StaticCache. * fix(cache): fallback of index_copy_ when not implemented * fix(cache): in index_copy_ ensure tensors are on same device * [run slow] llama * fix(cache): add move of cache_position to same device in SlidingWindowCache * Revert "[run slow] llama" This reverts commit 02608dd14253ccd464e31c108e0cd94364f0e8b9. --- src/transformers/cache_utils.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 34b457ce0189..0c03ea2735db 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -862,8 +862,18 @@ def update( k_out.copy_(key_states) v_out.copy_(value_states) else: - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states + # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to + # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place + # operation, that avoids copies and uses less memory. 
+ try: + # If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one + cache_position.to(device=k_out.device) + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states return k_out, v_out @@ -958,8 +968,14 @@ def update( k_out = k_out[:, :, indices] v_out = v_out[:, :, indices] - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states + try: + cache_position.to(device=k_out.device) + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment) self.key_cache[layer_idx].zero_() From 7d92009af647167bae338e9d4af8bc0452c62fbf Mon Sep 17 00:00:00 2001 From: Deep Gandhi <97520292+DeF0017@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:11:52 +0530 Subject: [PATCH 019/200] Added additional kwarg for successful running of optuna hyperparameter search (#31924) Update integration_utils.py Added additional kwarg --- src/transformers/integrations/integration_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index e9c91192ecf9..399c9f60cfa6 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -252,10 +252,11 @@ def _objective(trial, checkpoint_dir=None): timeout = kwargs.pop("timeout", None) n_jobs = kwargs.pop("n_jobs", 1) + gc_after_trial = kwargs.pop("gc_after_trial", False) directions = direction if isinstance(direction, list) else None direction = None if directions is not None else direction study = optuna.create_study(direction=direction, directions=directions, **kwargs) - study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs) + study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs, gc_after_trial=gc_after_trial) if not study._is_multi_objective(): best_trial = study.best_trial return BestRun(str(best_trial.number), best_trial.value, best_trial.params) From 9cf4f2aa9a9cecbb22e813931ef3bb72fc773540 Mon Sep 17 00:00:00 2001 From: RhuiDih <166782544+RhuiDih@users.noreply.github.com> Date: Tue, 23 Jul 2024 21:56:41 +0800 Subject: [PATCH 020/200] Enhancing SFT Training Efficiency Using Packing and FlashAttention2 with Position IDs (#31629) * add DataCollatorBatchFlattening * Update data_collator.py * change name * new FA2 flow if position_ids is provided * add comments * minor fix * minor fix data collator * add test cases for models * add test case for data collator * remove extra code * formating for ruff check and check_repo.py * ruff format ruff format tests src utils * custom_init_isort.py --- docs/source/en/main_classes/data_collator.md | 5 ++ src/transformers/__init__.py | 2 + src/transformers/data/__init__.py | 1 + src/transformers/data/data_collator.py | 35 ++++++++ .../modeling_flash_attention_utils.py | 79 +++++++++++++++++++ src/transformers/models/dbrx/modeling_dbrx.py | 1 + 
.../models/falcon/modeling_falcon.py | 1 + .../models/gemma/modeling_gemma.py | 1 + .../models/llama/modeling_llama.py | 1 + .../models/mistral/modeling_mistral.py | 1 + .../models/mixtral/modeling_mixtral.py | 1 + src/transformers/models/olmo/modeling_olmo.py | 1 + src/transformers/models/phi/modeling_phi.py | 1 + src/transformers/models/phi3/modeling_phi3.py | 1 + .../models/qwen2/modeling_qwen2.py | 1 + .../models/qwen2_moe/modeling_qwen2_moe.py | 1 + .../models/stablelm/modeling_stablelm.py | 1 + .../models/starcoder2/modeling_starcoder2.py | 1 + tests/test_modeling_common.py | 72 +++++++++++++++++ tests/trainer/test_data_collator.py | 19 +++++ 20 files changed, 226 insertions(+) diff --git a/docs/source/en/main_classes/data_collator.md b/docs/source/en/main_classes/data_collator.md index 74e653dd1185..e704bb747fe6 100644 --- a/docs/source/en/main_classes/data_collator.md +++ b/docs/source/en/main_classes/data_collator.md @@ -66,3 +66,8 @@ Examples of use can be found in the [example scripts](../examples) or [example n - numpy_mask_tokens - tf_mask_tokens - torch_mask_tokens + +## DataCollatorWithFlattening + +[[autodoc]] data.data_collator.DataCollatorWithFlattening + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index bc6e786358b6..05becb96e0b8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -103,6 +103,7 @@ "DataCollatorForSOP", "DataCollatorForTokenClassification", "DataCollatorForWholeWordMask", + "DataCollatorWithFlattening", "DataCollatorWithPadding", "DefaultDataCollator", "default_data_collator", @@ -4764,6 +4765,7 @@ DataCollatorForSOP, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, DefaultDataCollator, default_data_collator, diff --git a/src/transformers/data/__init__.py b/src/transformers/data/__init__.py index 1a8ef35ff439..8b675aae281f 100644 --- a/src/transformers/data/__init__.py +++ b/src/transformers/data/__init__.py @@ -19,6 +19,7 @@ DataCollatorForSOP, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, DefaultDataCollator, default_data_collator, diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index ce17f79ccfc8..20a21318786c 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -1611,3 +1611,38 @@ def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: ) & masked_indices[i] return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64) + + +@dataclass +class DataCollatorWithFlattening(DefaultDataCollator): + """ + Data collator used for padding free approach. Does the following: + + - concatate the entire mini batch into single long sequence [1, total_tokens] + - no padding will be added, returns `input_ids`, `labels` and `position_ids` + """ + + def __init__(self, *args, return_position_ids=True, **kwargs): + super().__init__(*args, **kwargs) + self.return_position_ids = return_position_ids + warnings.warn( + "Using `DataCollatorWithFlattening` will flatten the entire mini batch into single long sequence." + "Make sure your attention computation is able to handle it!" 
+ ) + + def __call__(self, features, return_tensors=None): + if return_tensors is None: + return_tensors = self.return_tensors + is_labels_provided = "labels" in features[0] + ret = {"input_ids": [], "labels": []} + if self.return_position_ids: + ret.update({"position_ids": []}) + for idx in range(0, len(features)): + ret["input_ids"] += features[idx]["input_ids"] + if is_labels_provided: + ret["labels"] += [-100] + features[idx]["labels"][1:] + else: + ret["labels"] += [-100] + features[idx]["input_ids"][1:] + if self.return_position_ids: + ret["position_ids"] += list(range(len(features[idx]["input_ids"]))) + return default_data_collator([ret], return_tensors) diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 1742e419b4aa..88dd99e6901d 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -130,6 +130,56 @@ def _upad_input( ) +def prepare_fa2_from_position_ids(query, key, value, position_ids): + """ + This function returns necessary arguments to call `flash_attn_varlen_func`. + All three query, key, value states will be flattened. + Cummulative lengths of each examples in the batch will be extracted from position_ids. + + NOTE: ideally cummulative lengths should be prepared at the data collator stage + + Arguments: + query (`torch.Tensor`): + Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim). + key (`torch.Tensor`): + Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + value (`torch.Tensor`): + Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + position_ids (`torch.Tensor`): + Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid. + + Return: + query (`torch.Tensor): + Query state without padding. Shape: (total_target_length, num_heads, head_dim). + key (`torch.Tensor`): + Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + value (`torch.Tensor`): + Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + indices_q (`torch.Tensor`): + The indices of non-masked tokens from the flattened input target sequence. + (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`): + The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,). + (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`): + Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value). 
+ """ + query = query.view(-1, query.size(-2), query.size(-1)) + key = key.view(-1, key.size(-2), key.size(-1)) + value = value.view(-1, value.size(-2), value.size(-1)) + position_ids = position_ids.flatten() + indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32) + + cu_seq_lens = torch.cat( + ( + indices_q[position_ids == 0], + torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32), + ) + ) + + max_length = position_ids.max() + 1 + + return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length)) + + def _flash_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, @@ -138,6 +188,7 @@ def _flash_attention_forward( query_length: int, is_causal: bool, dropout: float = 0.0, + position_ids: Optional[torch.Tensor] = None, softmax_scale: Optional[float] = None, sliding_window: Optional[int] = None, use_top_left_mask: bool = False, @@ -210,6 +261,34 @@ def _flash_attention_forward( **flash_kwargs, ) attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + + # if position_ids is provided and check not all examples (row) contain only 1 sequence, + # then use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach + elif position_ids is not None and not (position_ids[:, -1] == position_ids.size(1) - 1).all(): + batch_size = query_states.size(0) + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids( + query_states, key_states, value_states, position_ids + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + + attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1)) + else: attn_output = flash_attn_func( query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 31810028ef44..e3be8decbc6b 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -415,6 +415,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 663582c8a72a..fc7a38ed134d 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -602,6 +602,7 @@ def forward( value_layer, attention_mask, query_length, + position_ids=position_ids, dropout=attn_dropout, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 80e97fe700b5..5bc1af3e7ec7 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -393,6 +393,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, 
dropout=dropout_rate, sliding_window=getattr(self, "sliding_window", None), is_causal=self.is_causal, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 3115cee78f76..ce76d1d1ec1b 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -503,6 +503,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index dd814cd75fb1..93a60a49dbf3 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -382,6 +382,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self.config, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 82320de79386..d2ee6e6b268a 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -488,6 +488,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self.config, "sliding_window", None), is_causal=self.is_causal, diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index a56baf0653ec..74d49d5606c1 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -428,6 +428,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index f80453d3f7d9..1b23be39e5c0 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -501,6 +501,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=attn_dropout, softmax_scale=None, use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 76e3fbf514f6..90b815184b07 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -563,6 +563,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=attn_dropout, sliding_window=getattr(self.config, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 68923ed4052d..1ff8896ae5f9 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -429,6 +429,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=sliding_window, is_causal=self.is_causal, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 
d88b5c357e86..54e91da3347d 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -508,6 +508,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=sliding_window, is_causal=self.is_causal, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index ea50a20edea8..3a3b6a9e05f1 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -606,6 +606,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index af532b139ca3..f2786f9df48a 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -404,6 +404,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self.config, "sliding_window", None), is_causal=self.is_causal, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 19a945aec527..abe5ddea2c25 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4327,6 +4327,78 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + assert 0 in inputs_dict["attention_mask"], "assert padding in testing inputs" + # ensure left padding, to adapt for some models + if 0 in inputs_dict["attention_mask"][:, -1]: + inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1) + dummy_attention_mask = inputs_dict["attention_mask"] + inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.pad_token_id + + model = ( + model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ) + .to(torch_device) + .eval() + ) + + # flatten + padfree_inputs_dict = { + k: v[dummy_attention_mask.bool()].unsqueeze(0) + for k, v in inputs_dict.items() + if not k == "attention_mask" + } + # add position_ids + padfree_inputs_dict["position_ids"] = ( + torch.cat([torch.arange(length) for length in 
dummy_attention_mask.sum(1).tolist()]) + .long() + .unsqueeze(0) + .to(torch_device) + ) + + res_padded = model(**inputs_dict) + res_padfree = model(**padfree_inputs_dict) + + logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()] + logits_padfree = res_padfree.logits[0] + + torch.testing.assert_close(logits_padded.argmax(-1), logits_padfree.argmax(-1), atol=0, rtol=0) + # acceptable numerical instability + tol = torch.finfo(torch.float16).eps + torch.testing.assert_close(logits_padded, logits_padfree, atol=tol, rtol=tol) + @is_pt_tf_cross_test def test_tf_from_pt_safetensors(self): for model_class in self.all_model_classes: diff --git a/tests/trainer/test_data_collator.py b/tests/trainer/test_data_collator.py index 36e1813258d1..8c1f593ff4bc 100644 --- a/tests/trainer/test_data_collator.py +++ b/tests/trainer/test_data_collator.py @@ -26,6 +26,7 @@ DataCollatorForSeq2Seq, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, default_data_collator, is_tf_available, @@ -1531,6 +1532,24 @@ def test_data_collator_with_padding(self): batch = data_collator(features) self.assertEqual(batch["input_ids"].shape, (2, 8)) + def test_data_collator_with_flattening(self): + features = [ + {"input_ids": [10, 11, 12]}, + {"input_ids": [20, 21, 22, 23, 24, 25]}, + {"input_ids": [30, 31, 32, 33, 34, 35, 36]}, + ] + + data_collator = DataCollatorWithFlattening(return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (1, 16)) + self.assertEqual( + batch["input_ids"][0].tolist(), [10, 11, 12, 20, 21, 22, 23, 24, 25, 30, 31, 32, 33, 34, 35, 36] + ) + self.assertNotIn("attention_mask", batch) + self.assertIn("position_ids", batch) + self.assertEqual(batch["position_ids"].shape, (1, 16)) + self.assertEqual(batch["position_ids"][0].tolist(), [0, 1, 2, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6]) + def test_data_collator_for_token_classification(self): tokenizer = BertTokenizer(self.vocab_file) features = [ From d2c687b3f1859b5c61258af14abba5312c0e6201 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Tue, 23 Jul 2024 20:37:31 +0530 Subject: [PATCH 021/200] Updated `ruff` to the latest version (#31926) * Updated ruff version and fixed the required code accorindg to the latest version. * Updated ruff version and fixed the required code accorindg to the latest version. 
* Added noqa directive to ignore 1 error shown by ruff --- .../seq2seq-distillation/_test_seq2seq_examples.py | 4 ++-- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/hf_argparser.py | 2 +- src/transformers/modeling_flax_utils.py | 2 +- src/transformers/models/distilbert/modeling_distilbert.py | 2 +- .../models/distilbert/modeling_flax_distilbert.py | 2 +- src/transformers/models/esm/openfold_utils/rigid_utils.py | 4 ++-- .../models/markuplm/feature_extraction_markuplm.py | 2 +- src/transformers/models/musicgen/modeling_musicgen.py | 2 +- src/transformers/trainer_pt_utils.py | 2 +- tests/agents/test_agents.py | 2 +- tests/agents/test_python_interpreter.py | 4 ++-- tests/models/roformer/test_tokenization_roformer.py | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py index 454951ed3888..0ee4dd8afe1d 100644 --- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py +++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py @@ -418,7 +418,7 @@ def test_finetune_lr_schedulers(self): with CaptureStdout() as cs: args = parser.parse_args(args) assert False, "--help is expected to sys.exit" - assert excinfo.type == SystemExit + assert excinfo.type is SystemExit expected = lightning_base.arg_to_scheduler_metavar assert expected in cs.out, "--help is expected to list the supported schedulers" @@ -429,7 +429,7 @@ def test_finetune_lr_schedulers(self): with CaptureStderr() as cs: args = parser.parse_args(args) assert False, "invalid argument is expected to sys.exit" - assert excinfo.type == SystemExit + assert excinfo.type is SystemExit expected = f"invalid choice: '{unsupported_param}'" assert expected in cs.err, f"should have bailed on invalid choice of scheduler {unsupported_param}" diff --git a/setup.py b/setup.py index f6a6875dcda6..67f1cbfd80a2 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ "rhoknp>=1.1.0,<1.3.1", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff==0.4.4", + "ruff==0.5.1", "sacrebleu>=1.4.12,<2.0.0", "sacremoses", "safetensors>=0.4.1", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index fcbb8469b9e5..7644d8d68d16 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -63,7 +63,7 @@ "rhoknp": "rhoknp>=1.1.0,<1.3.1", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff": "ruff==0.4.4", + "ruff": "ruff==0.5.1", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.4.1", diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index 045bf798050e..4b5548fffb41 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -164,7 +164,7 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): ) if type(None) not in field.type.__args__: # filter `str` in Union - field.type = field.type.__args__[0] if field.type.__args__[1] == str else field.type.__args__[1] + field.type = field.type.__args__[0] if field.type.__args__[1] is str else field.type.__args__[1] origin_type = getattr(field.type, "__origin__", field.type) elif bool not in field.type.__args__: # filter `NoneType` in Union (except for `Union[bool, NoneType]`) diff --git 
a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index 61077cf7c309..9d12e1e67c80 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -90,7 +90,7 @@ def dtype_byte_size(dtype): 4 ``` """ - if dtype == bool: + if dtype is bool: return 1 / 8 bit_search = re.search(r"[^\d](\d+)$", dtype.name) if bit_search is None: diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 4d9173fd08bf..e80e3c41d22c 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -398,7 +398,7 @@ def forward( if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples - if type(sa_output) != tuple: + if type(sa_output) is not tuple: raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type") sa_output = sa_output[0] diff --git a/src/transformers/models/distilbert/modeling_flax_distilbert.py b/src/transformers/models/distilbert/modeling_flax_distilbert.py index d3c48c077adc..0cb7cdb033c1 100644 --- a/src/transformers/models/distilbert/modeling_flax_distilbert.py +++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py @@ -304,7 +304,7 @@ def __call__( if output_attentions: sa_output, sa_weights = sa_output else: - assert type(sa_output) == tuple + assert type(sa_output) is tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + hidden_states) diff --git a/src/transformers/models/esm/openfold_utils/rigid_utils.py b/src/transformers/models/esm/openfold_utils/rigid_utils.py index 2bc2fe5f5c4e..08f5ce0a4f7e 100644 --- a/src/transformers/models/esm/openfold_utils/rigid_utils.py +++ b/src/transformers/models/esm/openfold_utils/rigid_utils.py @@ -343,7 +343,7 @@ def __getitem__(self, index: Any) -> Rotation: Returns: The indexed rotation """ - if type(index) != tuple: + if type(index) is not tuple: index = (index,) if self._rot_mats is not None: @@ -827,7 +827,7 @@ def __getitem__(self, index: Any) -> Rigid: Returns: The indexed tensor """ - if type(index) != tuple: + if type(index) is not tuple: index = (index,) return Rigid( diff --git a/src/transformers/models/markuplm/feature_extraction_markuplm.py b/src/transformers/models/markuplm/feature_extraction_markuplm.py index 73c16bad302b..e3effdc910a8 100644 --- a/src/transformers/models/markuplm/feature_extraction_markuplm.py +++ b/src/transformers/models/markuplm/feature_extraction_markuplm.py @@ -68,7 +68,7 @@ def get_three_from_single(self, html_string): for element in html_code.descendants: if isinstance(element, bs4.element.NavigableString): - if type(element.parent) != bs4.element.Tag: + if type(element.parent) is not bs4.element.Tag: continue text_in_this_tag = html.unescape(element).strip() diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 7aaaeb461c13..b0e456db8add 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -2550,7 +2550,7 @@ def generate( generation_config.validate() self._validate_model_kwargs(model_kwargs.copy()) - if model_kwargs.get("encoder_outputs") is not None and type(model_kwargs["encoder_outputs"]) == tuple: + if model_kwargs.get("encoder_outputs") is not 
None and type(model_kwargs["encoder_outputs"]) is tuple: # wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=model_kwargs["encoder_outputs"][0]) diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 5c1ffd851636..69b547dec572 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -254,7 +254,7 @@ def reissue_pt_warnings(caught_warnings): # Reissue warnings that are not the SAVE_STATE_WARNING if len(caught_warnings) > 1: for w in caught_warnings: - if w.category != UserWarning or w.message != SAVE_STATE_WARNING: + if w.category is not UserWarning or w.message != SAVE_STATE_WARNING: warnings.warn(w.message, w.category) diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py index f47d0b0c35c3..6dac8b852015 100644 --- a/tests/agents/test_agents.py +++ b/tests/agents/test_agents.py @@ -198,7 +198,7 @@ def test_react_fails_max_iterations(self): ) agent.run("What is 2 multiplied by 3.6452?") assert len(agent.logs) == 7 - assert type(agent.logs[-1]["error"]) == AgentMaxIterationsError + assert type(agent.logs[-1]["error"]) is AgentMaxIterationsError @require_torch def test_init_agent_with_different_toolsets(self): diff --git a/tests/agents/test_python_interpreter.py b/tests/agents/test_python_interpreter.py index 8614302baae7..feb923af28b4 100644 --- a/tests/agents/test_python_interpreter.py +++ b/tests/agents/test_python_interpreter.py @@ -214,7 +214,7 @@ def test_evaluate_slicing(self): def test_access_attributes(self): code = "integer = 1\nobj_class = integer.__class__\nobj_class" result = evaluate_python_code(code, {}, state={}) - assert result == int + assert result is int def test_list_comprehension(self): code = "sentence = 'THESEAGULL43'\nmeaningful_sentence = '-'.join([char.lower() for char in sentence if char.isalpha()])" @@ -591,7 +591,7 @@ def test_types_as_objects(self): code = "type_a = float(2); type_b = str; type_c = int" state = {} result = evaluate_python_code(code, {"float": float, "str": str, "int": int}, state=state) - assert result == int + assert result is int def test_tuple_id(self): code = """ diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index 2c5b9c65e967..6dfd0a385f0d 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -56,7 +56,7 @@ def test_tokenizer(self): exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens) - def test_rust_tokenizer(self): + def test_rust_tokenizer(self): # noqa: F811 tokenizer = self.get_rust_tokenizer() input_text, output_text = self.get_chinese_input_output_texts() tokens = tokenizer.tokenize(input_text) From ff0d708fe627d6715f9a3e97d0a7947f70437447 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 23 Jul 2024 17:12:47 +0200 Subject: [PATCH 022/200] Dev version: v4.44.0.dev0 --- examples/flax/question-answering/run_qa.py | 2 +- .../speech-recognition/run_flax_speech_recognition_seq2seq.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../pytorch/audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- 
.../pytorch/image-classification/run_image_classification.py | 2 +- .../image-classification/run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- .../pytorch/instance-segmentation/run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/object-detection/run_object_detection.py | 2 +- .../pytorch/object-detection/run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../pytorch/question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../pytorch/semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../pytorch/speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_ctc_adapter.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- examples/pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_classification.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../tensorflow/image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 53 files changed, 53 insertions(+), 53 deletions(-) diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 1819b0235faf..cb708534116e 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 501aaf186424..cea636c9a782 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 281881c51182..5bb8a245bbd1 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 6eb162adcb07..88f4f2f635d0 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 3c75f0b1504d..ad9fde080373 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index c4936410c52a..60f698951d0b 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index b7557b903fdf..3f8a2433c94f 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index e67424f6819c..ee30bd460863 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 8af6e18b1ca3..a1624d12c01e 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index c2c3ff818b5b..379ad7eaecae 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index e3efbec76c44..fe1593fbdd84 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 9a29e43d7d30..e7d752a8b451 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index 8f57997deacb..e926b8b73ba5 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index cf80ae83ab2e..d6f04ed75ee4 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 7ef8d94f3e3c..577d86e4fc15 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 7154f1ffcd71..dd00375dfe84 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 11c64c7c4849..6cb4a3b6bca5 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index f40015d9701d..ada6a4534b67 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 75c3b1936fd0..8fe6697c497d 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 33dc8baaa6e9..3a9b3ffa16ac 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 51c13458ed44..7cc2a772d7ed 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 8356493762ed..697ec34ac886 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 3f1eb681df22..8250320dfbda 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 296f045a5234..34d04d99d53c 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 83eda3e98a75..53393ddc5d8a 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 4ba3564d6e50..bfdf1fc70a19 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index db6256aedbac..909084e65298 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 202cc7d661db..6aed16d9fba8 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 1932df9677ce..bac49185d55b 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index a5929205b2f5..ad92a3c99e6b 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 5ff906c22cba..5c78da9119c0 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index b4019d64f774..ffcca59f3ddb 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 3da281430ec2..77bd768cc4f3 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 501b2df1c5eb..b18bfb15ea17 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 225b20fcd635..9bc80cc5f27e 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 325cbebd9634..3f35c72113cf 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index b4520d6af340..d37ebb7c4bbf 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index e3bb7ea94ca0..751084a6f6e3 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 5ec90faf87a5..152b8b02999d 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 7fa0d3bc9005..18f5d37821c1 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 937d09d74d62..65a34df2945d 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 61e14da6cb8b..085dc7879d9b 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 7ccfc30802f5..f4194d4e30b8 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index e2e9f3d3d3da..7e7ce04f0241 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index ba83bbe56dcf..9cd6dc08bff5 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index a3ea7cf0b10d..025ab7c448b8 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 7d8189b087ef..d6fe9ab7cb50 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index d9758c401826..1c3de1b137f3 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 3f5190186dc3..b5f6f444057a 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index ec3fbe8c61ba..c098192477a3 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 7280f7a95b37..6fe65b0cf3c6 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 67f1cbfd80a2..5815d69e4da7 100644 --- a/setup.py +++ b/setup.py @@ -430,7 +430,7 @@ def run(self): setup( name="transformers", - version="4.43.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.44.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 05becb96e0b8..9108367f35b3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.43.0.dev0" +__version__ = "4.44.0.dev0" from typing import TYPE_CHECKING From d5a99dfcee6e94065cb7c83cc8ab6fc5daa0cc4e Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 23 Jul 2024 16:58:17 +0200 Subject: [PATCH 023/200] Llama 3.1 conversion Co-authored-by: Arthur Zucker --- src/transformers/modeling_rope_utils.py | 103 +++++++++++- .../models/llama/configuration_llama.py | 31 ++-- .../llama/convert_llama_weights_to_hf.py | 153 ++++++++++++------ 3 files changed, 228 insertions(+), 59 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 33055d2bf942..14a12b939405 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -129,6 +129,7 @@ def _compute_dynamic_ntk_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling if config is not None and len(rope_kwargs) > 0: raise ValueError( "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " @@ -249,6 +250,7 @@ def _compute_longrope_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. """ + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling # No need to keep BC with longrope, unreleased when this new pattern was created. if len(rope_kwargs) > 0: raise ValueError( @@ -293,6 +295,50 @@ def _compute_longrope_parameters( return inv_freq, attention_factor +def _compute_llama3_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies for llama 3.1. + + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + factor = config.rope_scaling["factor"] # `8` in the original implementation + low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation + high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation + old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in inv_freq: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + new_freqs.append((1 - smooth) * freq / factor + smooth * freq) + inv_freq = torch.tensor(new_freqs, dtype=inv_freq.dtype, device=inv_freq.device) + return inv_freq, attention_factor + + # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters # from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE # parameterizations, as long as the callable has the same signature. 
@@ -302,6 +348,7 @@ def _compute_longrope_parameters( "dynamic": _compute_dynamic_ntk_parameters, "yarn": _compute_yarn_parameters, "longrope": _compute_longrope_parameters, + "llama3": _compute_llama3_parameters, } @@ -339,6 +386,20 @@ def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") +def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"original_max_position_embeddings"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + def _validate_yarn_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling rope_type = rope_scaling["rope_type"] @@ -374,7 +435,8 @@ def _validate_longrope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling rope_type = rope_scaling["rope_type"] required_keys = {"rope_type", "short_factor", "long_factor"} - optional_keys = {"attention_factor", "factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys, optional_keys) @@ -417,13 +479,50 @@ def _validate_longrope_parameters(config: PretrainedConfig): ) +def _validate_llama3_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + if low_freq_factor is None or not isinstance(low_freq_factor, float): + raise ValueError(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") + if high_freq_factor is None or not isinstance(high_freq_factor, float): + raise ValueError(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") + if high_freq_factor < low_freq_factor: + raise ValueError( + "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" + f"{high_freq_factor} and low_freq_factor={low_freq_factor}" + ) + + original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] + if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): + raise ValueError( + "`rope_scaling`'s original_max_position_embeddings field must be an integer, got " + f"{original_max_position_embeddings}" + ) + if original_max_position_embeddings >= config.max_position_embeddings: + raise ValueError( + "`rope_scaling`'s original_max_position_embeddings field 
must be less than max_position_embeddings, got " + f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" + ) + + # Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. ROPE_VALIDATION_FUNCTIONS = { "default": _validate_default_rope_parameters, "linear": _validate_linear_scaling_rope_parameters, - "dynamic": _validate_linear_scaling_rope_parameters, # `dynamic` has the same validation pattern as `linear` + "dynamic": _validate_dynamic_scaling_rope_parameters, "yarn": _validate_yarn_parameters, "longrope": _validate_longrope_parameters, + "llama3": _validate_llama3_parameters, } diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 7c987ec85a04..c632a870be7a 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -73,25 +73,28 @@ class LlamaConfig(PretrainedConfig): End of stream token id. pretraining_tp (`int`, *optional*, defaults to 1): Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. IMPORTANT: RoPE scaling expects - `max_position_embeddings` to remain unchanged -- some methods, like 'longrope', require the original value - to determine which scaling to apply. + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. Expected contents: `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope'], - with 'default' being the original RoPE implementation. + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. `factor` (`float`, *optional*): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In most scaling types, a `factor` of x will enable the model to handle sequences of length x * - `max_position_embeddings`. + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. `attention_factor` (`float`, *optional*): Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention computation. 
If unspecified, it defaults to value recommended by the implementation, using the @@ -104,12 +107,16 @@ class LlamaConfig(PretrainedConfig): ramp function. If unspecified, it defaults to 1. `short_factor` (`List[float]`, *optional*): Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index fd6ab4f2e926..384daab6b6d7 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -17,10 +17,11 @@ import os import shutil import warnings +from typing import List import torch -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast +from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast from transformers.convert_slow_tokenizer import TikTokenConverter @@ -85,8 +86,12 @@ "65B": 8, "70B": 8, "70Bf": 8, + "405B": 8, + "405B-MP16": 16, } +CONTEXT_LENGTH_FOR_VERSION = {"3.1": 131072, "3": 8192, "2": 4096, "1": 2048} + def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) @@ -107,9 +112,10 @@ def write_model( input_base_path, model_size=None, safe_serialization=True, - llama_version=1, + llama_version="1", vocab_size=None, num_shards=None, + instruct=False, ): os.makedirs(model_path, exist_ok=True) tmp_model_path = os.path.join(model_path, "tmp") @@ -125,18 +131,11 @@ def write_model( dims_per_head = dim // n_heads base = params.get("rope_theta", 10000.0) inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - if base > 10000.0 and llama_version != 3: + if base > 10000.0 and float(llama_version) < 3: max_position_embeddings = 16384 else: - # Depending on the Llama version, the default max_position_embeddings has different values. 
- if llama_version == 1: - max_position_embeddings = 2048 - elif llama_version == 2: - max_position_embeddings = 4096 - elif llama_version == 3: - max_position_embeddings = 8192 - - vocab_size = vocab_size if vocab_size is not None else 32000 + max_position_embeddings = CONTEXT_LENGTH_FOR_VERSION[llama_version] + if params.get("n_kv_heads", None) is not None: num_key_value_heads = params["n_kv_heads"] # for GQA / MQA num_key_value_heads_per_shard = num_key_value_heads // num_shards @@ -144,8 +143,7 @@ def write_model( else: # compatibility with other checkpoints num_key_value_heads = n_heads num_key_value_heads_per_shard = n_heads_per_shard - key_value_dim = dims_per_head * num_key_value_heads - print(num_shards, num_key_value_heads, num_key_value_heads_per_shard, key_value_dim) + key_value_dim = dim # permute for sliced rotary def permute(w, n_heads, dim1=dim, dim2=dim): @@ -159,11 +157,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") else: # Sharded - loaded = [ - torch.load(os.path.join(input_base_path, file), map_location="cpu") - for file in sorted(os.listdir(input_base_path)) - if file.endswith(".pth") - ] + checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) + print("Loading in order:", checkpoint_list) + loaded = [torch.load(os.path.join(input_base_path, file), map_location="cpu") for file in checkpoint_list] param_count = 0 index_dict = {"weight_map": {}} for layer_i in range(n_layers): @@ -263,7 +259,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): "lm_head.weight": loaded["output.weight"], } else: - concat_dim = 0 if llama_version == 3 else 1 + concat_dim = 0 if llama_version in ["3", "3.1"] else 1 state_dict = { "model.norm.weight": loaded[0]["norm.weight"], "model.embed_tokens.weight": torch.cat( @@ -282,6 +278,18 @@ def permute(w, n_heads, dim1=dim, dim2=dim): write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 multiple_of = params["multiple_of"] if "multiple_of" in params else 256 + + if llama_version in ["3", "3.1"]: + bos_token_id = 128000 + + if instruct: + eos_token_id = [128001, 128008, 128009] + else: + eos_token_id = 128001 + else: + bos_token_id = 1 + eos_token_id = 2 + config = LlamaConfig( hidden_size=dim, intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), @@ -292,11 +300,21 @@ def permute(w, n_heads, dim1=dim, dim2=dim): vocab_size=vocab_size, rope_theta=base, max_position_embeddings=max_position_embeddings, - bos_token_id=128000 if llama_version == 3 else 1, - eos_token_id=128001 if llama_version == 3 else 2, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, ) config.save_pretrained(tmp_model_path) + if instruct: + generation_config = GenerationConfig( + do_sample=True, + temperature=0.6, + top_p=0.9, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + ) + generation_config.save_pretrained(tmp_model_path) + # Make space so we can load the model properly now. 
del state_dict del loaded @@ -313,7 +331,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): class Llama3Converter(TikTokenConverter): - def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs): + def __init__(self, vocab_file, special_tokens=None, instruct=False, model_max_length=None, **kwargs): super().__init__(vocab_file, **kwargs) tokenizer = self.converted() chat_template = ( @@ -327,34 +345,24 @@ def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs): "{% endfor %}" "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" ) - num_reserved_special_tokens = 256 - special_tokens = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)] tokenizer.add_special_tokens(special_tokens) self.tokenizer = PreTrainedTokenizerFast( tokenizer_object=tokenizer, bos_token="<|begin_of_text|>", - eos_token="<|end_of_text|>", - chat_template=chat_template, + eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>", + chat_template=chat_template if instruct else None, model_input_names=["input_ids", "attention_mask"], + model_max_length=model_max_length, ) -def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2): +def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False): tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - if llama_version == 3: - tokenizer = Llama3Converter(input_tokenizer_path).tokenizer + if llama_version in ["3", "3.1"]: + tokenizer = Llama3Converter( + input_tokenizer_path, special_tokens, instruct, model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version] + ).tokenizer else: tokenizer = tokenizer_class(input_tokenizer_path) print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") @@ -362,6 +370,37 @@ def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2): return tokenizer +DEFAULT_LLAMA_SPECIAL_TOKENS = { + "3": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", # end of turn + ] + + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], + "3.1": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|finetune_right_pad_id|>", + "<|reserved_special_token_2|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eom_id|>", # end of message + "<|eot_id|>", # end of turn + "<|python_tag|>", + ] + + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], +} + + def main(): parser = argparse.ArgumentParser() parser.add_argument( @@ -383,9 +422,9 @@ def main(): # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. parser.add_argument( "--llama_version", - choices=[1, 2, 3], - default=1, - type=int, + choices=["1", "2", "3", "3.1"], + default="1", + type=str, help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. 
Controls the context size", ) parser.add_argument( @@ -394,11 +433,34 @@ def main(): type=int, help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth", ) + parser.add_argument( + "--special_tokens", + default=None, + type=List[str], + help="The list of special tokens that should be added to the model.", + ) + parser.add_argument( + "--instruct", + default=False, + type=bool, + help="Whether the model is an instruct model or not. Will affect special tokens for llama 3.1.", + ) args = parser.parse_args() if args.model_size is None and args.num_shards is None: raise ValueError("You have to set at least `num_shards` if you are not giving the `model_size`") + if args.special_tokens is None: + args.special_tokens = DEFAULT_LLAMA_SPECIAL_TOKENS[str(args.llama_version)] + spm_path = os.path.join(args.input_dir, "tokenizer.model") - vocab_size = len(write_tokenizer(args.output_dir, spm_path, llama_version=args.llama_version)) + vocab_size = len( + write_tokenizer( + args.output_dir, + spm_path, + llama_version=args.llama_version, + special_tokens=args.special_tokens, + instruct=args.instruct, + ) + ) if args.model_size != "tokenizer_only": write_model( model_path=args.output_dir, @@ -408,6 +470,7 @@ def main(): llama_version=args.llama_version, vocab_size=vocab_size, num_shards=args.num_shards, + instruct=args.instruct, ) From 23f6a43f82fb2980f4b30cf3f95eb3a940384895 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 23 Jul 2024 16:48:16 +0100 Subject: [PATCH 024/200] fix (#32162) --- src/transformers/models/llama/modeling_llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index ce76d1d1ec1b..adb455acfbbc 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -107,7 +107,7 @@ def __init__( else: # BC: "rope_type" was originally "type" if config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling["type"]) + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) else: self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings From bc2adb0112b6677b0dfb4105c74570a0f92183eb Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Tue, 23 Jul 2024 21:22:41 +0530 Subject: [PATCH 025/200] fix: Fixed an if condition that is always evaluating to true (#32160) Fixed an if condition always evaluating to true. 
--- src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py index a407a67f3813..16e4d71212b5 100644 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -43,7 +43,7 @@ def get_dpt_config(checkpoint_url): config.neck_hidden_sizes = [256, 512, 1024, 1024] expected_shape = (1, 384, 384) - if "nyu" or "midas" in checkpoint_url: + if "nyu" in checkpoint_url or "midas" in checkpoint_url: config.hidden_size = 768 config.reassemble_factors = [1, 1, 1, 0.5] config.neck_hidden_sizes = [256, 512, 768, 768] From c85510f958e6955d88ea1bafb4f320074bfbd0c1 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 24 Jul 2024 00:47:51 +0800 Subject: [PATCH 026/200] [docs] change temperature to a positive value (#32077) fix --- docs/source/en/conversations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md index 9336503ad7cb..a48c046b4949 100644 --- a/docs/source/en/conversations.md +++ b/docs/source/en/conversations.md @@ -195,7 +195,7 @@ inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()} print("Tokenized inputs:\n", inputs) # 4: Generate text from the model -outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.) +outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1) print("Generated tokens:\n", outputs) # 5: Decode the output back to a string From 01be5b48790f113b7d71943b580c842e3e097988 Mon Sep 17 00:00:00 2001 From: Rohit Dwivedula <25080952+rohitdwivedula@users.noreply.github.com> Date: Wed, 24 Jul 2024 02:09:59 -0500 Subject: [PATCH 027/200] adds: extra_repr() to MambaRMSNorm to include hidden size / size of weights in the layer (#32171) * adds: extra_repr() to MambaRMSNorm to include the hidden size of the layer * style fix with ruff: --- src/transformers/models/mamba/modeling_mamba.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 50c0f9ebe4a5..fb519bee3da0 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -327,6 +327,9 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{self.weight.shape[0]}, eps={self.variance_epsilon}" + class MambaBlock(nn.Module): def __init__(self, config, layer_idx): From 8678879f1dc2578cec18232146bf19de97aecaa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=A1=B0=EC=A4=80=EB=9E=98?= Date: Wed, 24 Jul 2024 19:38:49 +0900 Subject: [PATCH 028/200] fix: default value reflects the runtime environment variables rather than the ones present at import time. (#32153) * fix: default value reflects the runtime environment variables rather than the ones present at import time. 
* Fix: Change `deterministic` to None by default; use env var if None --- src/transformers/modeling_flash_attention_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 88dd99e6901d..4f0ff8817b7e 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -193,7 +193,7 @@ def _flash_attention_forward( sliding_window: Optional[int] = None, use_top_left_mask: bool = False, softcap: Optional[float] = None, - deterministic: bool = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1", + deterministic: bool = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -233,6 +233,8 @@ def _flash_attention_forward( flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {} if is_flash_attn_greater_or_equal("2.4.1"): + if deterministic is None: + deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" flash_kwargs["deterministic"] = deterministic if softcap is not None: From 5f4ee98a7ade33e1c54fdd6181d04ee7b426b392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dr=2E=20Artificial=E6=9B=BE=E5=B0=8F=E5=81=A5?= <875100501@qq.com> Date: Wed, 24 Jul 2024 18:54:41 +0800 Subject: [PATCH 029/200] Update qwen2.md (#32108) * Update qwen2.md outdated description * Update qwen2.md amended * Update qwen2.md Update * Update qwen2.md fix wrong version code, now good to go --- docs/source/en/model_doc/qwen2.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index ac0e25e02c35..16815f2fc1f3 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. ## Overview -Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc. +Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc. ### Model Details @@ -27,16 +27,16 @@ Qwen2 is a language model series including decoder language models of different ## Usage tips -`Qwen2-7B-beta` and `Qwen2-7B-Chat-beta` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) +`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) -In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. +In the following, we demonstrate how to use `Qwen2-7B-Instruct` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. 
```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> device = "cuda" # the device to load the model onto ->>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat", device_map="auto") ->>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat") +>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") >>> prompt = "Give me a short introduction to large language model." From 165116bc145dcc186fa287e624b28a9ab3a79955 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 24 Jul 2024 14:03:40 +0100 Subject: [PATCH 030/200] Remove conversational pipeline tests (#32099) Remove conversation pipeline tests --- .../test_pipelines_conversational.py | 439 ------------------ 1 file changed, 439 deletions(-) delete mode 100644 tests/pipelines/test_pipelines_conversational.py diff --git a/tests/pipelines/test_pipelines_conversational.py b/tests/pipelines/test_pipelines_conversational.py deleted file mode 100644 index 5b6eb514b1a9..000000000000 --- a/tests/pipelines/test_pipelines_conversational.py +++ /dev/null @@ -1,439 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -from transformers import ( - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - BlenderbotSmallForConditionalGeneration, - BlenderbotSmallTokenizer, - Conversation, - ConversationalPipeline, - TFAutoModelForCausalLM, - pipeline, -) -from transformers.testing_utils import ( - backend_empty_cache, - is_pipeline_test, - is_torch_available, - require_tf, - require_torch, - slow, - torch_device, -) - -from .test_pipelines_common import ANY - - -@is_pipeline_test -class ConversationalPipelineTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - if is_torch_available(): - backend_empty_cache(torch_device) - - model_mapping = dict( - list(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) - if MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - else [] + list(MODEL_FOR_CAUSAL_LM_MAPPING.items()) - if MODEL_FOR_CAUSAL_LM_MAPPING - else [] - ) - tf_model_mapping = dict( - list(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) - if TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - else [] + list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.items()) - if TF_MODEL_FOR_CAUSAL_LM_MAPPING - else [] - ) - - def get_test_pipeline(self, model, tokenizer, processor): - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - return conversation_agent, [Conversation("Hi there!")] - - def run_pipeline_test(self, conversation_agent, _): - # Simple - outputs = conversation_agent(Conversation("Hi there!"), max_new_tokens=5) - self.assertEqual( - outputs, - Conversation([{"role": "user", "content": "Hi there!"}, {"role": "assistant", "content": ANY(str)}]), - ) - - # Single list - outputs = conversation_agent([Conversation("Hi there!")], max_new_tokens=5) - self.assertEqual( - outputs, - Conversation([{"role": "user", "content": "Hi there!"}, {"role": "assistant", "content": ANY(str)}]), - ) - - # Batch - conversation_1 = Conversation("Going to the movies tonight - any suggestions?") - conversation_2 = Conversation("What's the last book you have read?") - self.assertEqual(len(conversation_1), 1) - self.assertEqual(len(conversation_2), 1) - - outputs = conversation_agent([conversation_1, conversation_2], max_new_tokens=5) - self.assertEqual(outputs, [conversation_1, conversation_2]) - self.assertEqual( - outputs, - [ - Conversation( - [ - {"role": "user", "content": "Going to the movies tonight - any suggestions?"}, - {"role": "assistant", "content": ANY(str)}, - ], - ), - Conversation( - [ - {"role": "user", "content": "What's the last book you have read?"}, - {"role": "assistant", "content": ANY(str)}, - ] - ), - ], - ) - - # One conversation with history - conversation_2.add_message({"role": "user", "content": "Why do you recommend it?"}) - outputs = conversation_agent(conversation_2, max_new_tokens=5) - self.assertEqual(outputs, conversation_2) - self.assertEqual( - outputs, - Conversation( - [ - {"role": "user", "content": "What's the last book you have read?"}, - {"role": "assistant", "content": ANY(str)}, - {"role": "user", "content": "Why do you recommend it?"}, - {"role": "assistant", "content": ANY(str)}, - ] - ), - ) - - @require_torch - @slow - def test_integration_torch_conversation(self): - # When - conversation_agent = pipeline(task="conversational", device=torch_device) - conversation_1 = Conversation("Going to the 
movies tonight - any suggestions?") - conversation_2 = Conversation("What's the last book you have read?") - # Then - self.assertEqual(len(conversation_1.past_user_inputs), 0) - self.assertEqual(len(conversation_2.past_user_inputs), 0) - # When - result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) - # Then - self.assertEqual(result, [conversation_1, conversation_2]) - self.assertEqual(len(result[0].past_user_inputs), 1) - self.assertEqual(len(result[1].past_user_inputs), 1) - self.assertEqual(len(result[0].generated_responses), 1) - self.assertEqual(len(result[1].generated_responses), 1) - self.assertEqual(result[0].past_user_inputs[0], "Going to the movies tonight - any suggestions?") - self.assertEqual(result[0].generated_responses[0], "The Big Lebowski") - self.assertEqual(result[1].past_user_inputs[0], "What's the last book you have read?") - self.assertEqual(result[1].generated_responses[0], "The Last Question") - # When - conversation_2.add_user_input("Why do you recommend it?") - result = conversation_agent(conversation_2, do_sample=False, max_length=1000) - # Then - self.assertEqual(result, conversation_2) - self.assertEqual(len(result.past_user_inputs), 2) - self.assertEqual(len(result.generated_responses), 2) - self.assertEqual(result.past_user_inputs[1], "Why do you recommend it?") - self.assertEqual(result.generated_responses[1], "It's a good book.") - - @require_torch - @slow - def test_integration_torch_conversation_truncated_history(self): - # When - conversation_agent = pipeline(task="conversational", min_length_for_response=24, device=torch_device) - conversation_1 = Conversation("Going to the movies tonight - any suggestions?") - # Then - self.assertEqual(len(conversation_1.past_user_inputs), 0) - # When - result = conversation_agent(conversation_1, do_sample=False, max_length=36) - # Then - self.assertEqual(result, conversation_1) - self.assertEqual(len(result.past_user_inputs), 1) - self.assertEqual(len(result.generated_responses), 1) - self.assertEqual(result.past_user_inputs[0], "Going to the movies tonight - any suggestions?") - self.assertEqual(result.generated_responses[0], "The Big Lebowski") - # When - conversation_1.add_user_input("Is it an action movie?") - result = conversation_agent(conversation_1, do_sample=False, max_length=36) - # Then - self.assertEqual(result, conversation_1) - self.assertEqual(len(result.past_user_inputs), 2) - self.assertEqual(len(result.generated_responses), 2) - self.assertEqual(result.past_user_inputs[1], "Is it an action movie?") - self.assertEqual(result.generated_responses[1], "It's a comedy.") - - @require_torch - def test_small_model_pt(self): - tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") - model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - conversation = Conversation("hello") - output = conversation_agent(conversation) - self.assertEqual(output, Conversation(past_user_inputs=["hello"], generated_responses=["Hi"])) - - @require_tf - def test_small_model_tf(self): - tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") - model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - conversation = Conversation("hello") - output = conversation_agent(conversation) - self.assertEqual(output, Conversation(past_user_inputs=["hello"], 
generated_responses=["Hi"])) - - @require_torch - @slow - def test_integration_torch_conversation_dialogpt_input_ids(self): - tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") - model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - - conversation_1 = Conversation("hello") - inputs = conversation_agent.preprocess(conversation_1) - self.assertEqual(inputs["input_ids"].tolist(), [[31373, 50256]]) - - conversation_2 = Conversation("how are you ?", past_user_inputs=["hello"], generated_responses=["Hi there!"]) - inputs = conversation_agent.preprocess(conversation_2) - self.assertEqual( - inputs["input_ids"].tolist(), [[31373, 50256, 17250, 612, 0, 50256, 4919, 389, 345, 5633, 50256]] - ) - - @unittest.skip(reason="Model is curently gated") - @require_torch - @slow - def test_integration_torch_conversation_llama2_input_ids(self): - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_default_system_prompt=True) - - conversation = Conversation( - "What is so great about #1?", - past_user_inputs=["I am going to Paris, what should I see?"], - generated_responses=[ - """\ -Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris: - -1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city. -2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa. -3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows. - -These are just a few of the many attractions that Paris has to offer. 
With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world.""" - ], - ) - inputs = tokenizer._build_conversation_input_ids(conversation) - EXPECTED_INPUTS_IDS = [ 1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 29902, 626, 2675, 304, 3681, 29892, 825, 881, 306, 1074, 29973, 518, 29914, 25580, 29962, 3681, 29892, 278, 7483, 310, 3444, 29892, 338, 2998, 363, 967, 380, 27389, 11258, 29892, 1616, 19133, 29879, 29892, 15839, 2982, 22848, 29892, 322, 6017, 7716, 25005, 29889, 2266, 526, 777, 310, 278, 2246, 19650, 1953, 304, 1074, 297, 3681, 29901, 13, 13, 29896, 29889, 450, 382, 2593, 295, 23615, 29901, 450, 9849, 293, 382, 2593, 295, 23615, 338, 697, 310, 278, 1556, 5936, 13902, 2982, 22848, 297, 278, 3186, 322, 16688, 2078, 271, 400, 5086, 8386, 310, 278, 4272, 29889, 13, 29906, 29889, 450, 4562, 12675, 6838, 29901, 450, 4562, 12675, 338, 697, 310, 278, 3186, 29915, 29879, 10150, 322, 1556, 13834, 19133, 29879, 29892, 27261, 385, 21210, 573, 4333, 310, 1616, 322, 24238, 29879, 29892, 3704, 278, 2598, 29874, 29420, 29889, 13, 29941, 29889, 24337, 29899, 29928, 420, 315, 21471, 29901, 910, 9560, 274, 21471, 338, 697, 310, 278, 1556, 13834, 2982, 22848, 297, 3681, 322, 338, 2998, 363, 967, 22883, 293, 11258, 322, 380, 27389, 380, 7114, 12917, 5417, 29889, 13, 13, 1349, 968, 526, 925, 263, 2846, 310, 278, 1784, 19650, 1953, 393, 3681, 756, 304, 5957, 29889, 2973, 577, 1568, 304, 1074, 322, 437, 29892, 372, 29915, 29879, 694, 4997, 393, 3681, 338, 697, 310, 278, 1556, 5972, 6282, 391, 15422, 800, 297, 278, 3186, 29889, 29871, 2, 1, 518, 25580, 29962, 1724, 338, 577, 2107, 1048, 396, 29896, 29973, 518, 29914, 25580, 29962] # fmt: skip - self.assertEqual(inputs, EXPECTED_INPUTS_IDS) - - model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - EXPECTED_TEXT = "what topic you want to focus on and create content around it. This will help you stand out from other creators and attract a specific audience.\n\nStep 2: Set Up Your Channel\nCreate your YouTube account and customize your channel with your branding and logo. Make sure your channel name and profile picture are consistent with your niche.\n\nStep 3: Plan Your Content\nDevelop a content strategy that includes the type of content you want to create, how often you will post, and when you will post. Consider creating a content calendar to help you stay organized.\n\nStep 4: Invest in Quality Equipment\nInvest in good quality camera and microphone equipment to ensure your videos look and sound professional. 
You don't need to break the bank, but investing in good equipment will make a big difference in the quality of your videos.\n\nStep 5: Optimize Your Videos for Search\nUse keywords in your video titles, descriptions, and tags to help people find your videos when they search for topics related to your niche" - conversation = Conversation( - "<>\n Only answer with emojis, and charades\n<>\n\nHow can I build a house in 10 steps?" - ) - result = conversation_agent(conversation) - self.assertEqual(result.generated_responses[-1], EXPECTED_TEXT) - - @require_torch - @slow - def test_integration_torch_conversation_blenderbot_400M_input_ids(self): - tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") - model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - - # test1 - conversation_1 = Conversation("hello") - inputs = conversation_agent.preprocess(conversation_1) - self.assertEqual(inputs["input_ids"].tolist(), [[1710, 86, 2]]) - - # test2 - conversation_1 = Conversation( - "I like lasagne.", - past_user_inputs=["hello"], - generated_responses=[ - " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie." - ], - ) - inputs = conversation_agent.preprocess(conversation_1) - self.assertEqual( - inputs["input_ids"].tolist(), - [ - # This should be compared with the same conversation on ParlAI `safe_interactive` demo. - [ - 1710, # hello - 86, - 228, # Double space - 228, - 946, - 304, - 398, - 6881, - 558, - 964, - 38, - 452, - 315, - 265, - 6252, - 452, - 322, - 968, - 6884, - 3146, - 278, - 306, - 265, - 617, - 87, - 388, - 75, - 341, - 286, - 521, - 21, - 228, # Double space - 228, - 281, # I like lasagne. - 398, - 6881, - 558, - 964, - 21, - 2, # EOS - ], - ], - ) - - @require_torch - @slow - def test_integration_torch_conversation_blenderbot_400M(self): - tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") - model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - - conversation_1 = Conversation("hello") - result = conversation_agent( - conversation_1, - ) - self.assertEqual( - result.generated_responses[0], - # ParlAI implementation output, we have a different one, but it's our - # second best, you can check by using num_return_sequences=10 - # " Hello! How are you? I'm just getting ready to go to work, how about you?", - " Hello! How are you doing today? I just got back from a walk with my dog.", - ) - - conversation_1 = Conversation("Lasagne hello") - result = conversation_agent(conversation_1, encoder_no_repeat_ngram_size=3) - self.assertEqual( - result.generated_responses[0], - " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie.", - ) - - conversation_1 = Conversation( - "Lasagne hello Lasagne is my favorite Italian dish. Do you like lasagne? I like lasagne." - ) - result = conversation_agent( - conversation_1, - encoder_no_repeat_ngram_size=3, - ) - self.assertEqual( - result.generated_responses[0], - " Me too. 
I like how it can be topped with vegetables, meats, and condiments.", - ) - - @require_torch - @slow - def test_integration_torch_conversation_encoder_decoder(self): - # When - tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") - model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot_small-90M") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer, device=torch_device) - - conversation_1 = Conversation("My name is Sarah and I live in London") - conversation_2 = Conversation("Going to the movies tonight, What movie would you recommend? ") - # Then - self.assertEqual(len(conversation_1.past_user_inputs), 0) - self.assertEqual(len(conversation_2.past_user_inputs), 0) - # When - result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) - # Then - self.assertEqual(result, [conversation_1, conversation_2]) - self.assertEqual(len(result[0].past_user_inputs), 1) - self.assertEqual(len(result[1].past_user_inputs), 1) - self.assertEqual(len(result[0].generated_responses), 1) - self.assertEqual(len(result[1].generated_responses), 1) - self.assertEqual(result[0].past_user_inputs[0], "My name is Sarah and I live in London") - self.assertEqual( - result[0].generated_responses[0], - "hi sarah, i live in london as well. do you have any plans for the weekend?", - ) - self.assertEqual( - result[1].past_user_inputs[0], "Going to the movies tonight, What movie would you recommend? " - ) - self.assertEqual( - result[1].generated_responses[0], "i don't know... i'm not really sure. what movie are you going to see?" - ) - # When - conversation_1.add_user_input("Not yet, what about you?") - conversation_2.add_user_input("What's your name?") - result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) - # Then - self.assertEqual(result, [conversation_1, conversation_2]) - self.assertEqual(len(result[0].past_user_inputs), 2) - self.assertEqual(len(result[1].past_user_inputs), 2) - self.assertEqual(len(result[0].generated_responses), 2) - self.assertEqual(len(result[1].generated_responses), 2) - self.assertEqual(result[0].past_user_inputs[1], "Not yet, what about you?") - self.assertEqual(result[0].generated_responses[1], "i don't have any plans yet. i'm not sure what to do yet.") - self.assertEqual(result[1].past_user_inputs[1], "What's your name?") - self.assertEqual(result[1].generated_responses[1], "i don't have a name, but i'm going to see a horror movie.") - - @require_torch - @slow - def test_from_pipeline_conversation(self): - model_id = "facebook/blenderbot_small-90M" - - # from model id - conversation_agent_from_model_id = pipeline("conversational", model=model_id, tokenizer=model_id) - - # from model object - model = BlenderbotSmallForConditionalGeneration.from_pretrained(model_id) - tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_id) - conversation_agent_from_model = pipeline("conversational", model=model, tokenizer=tokenizer) - - conversation = Conversation("My name is Sarah and I live in London") - conversation_copy = Conversation("My name is Sarah and I live in London") - - result_model_id = conversation_agent_from_model_id([conversation]) - result_model = conversation_agent_from_model([conversation_copy]) - - # check for equality - self.assertEqual( - result_model_id.generated_responses[0], - "hi sarah, i live in london as well. 
do you have any plans for the weekend?", - ) - self.assertEqual( - result_model_id.generated_responses[0], - result_model.generated_responses[0], - ) From e0182f3bd7f4753c1e378e052ceea67898d97359 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 24 Jul 2024 15:00:48 +0100 Subject: [PATCH 031/200] RoPE: relaxed rope validation (#32182) * relaxed rope check * lets also accept rope_type=None, defaulting to the original implementation * type and rope_type can coexist --- src/transformers/modeling_rope_utils.py | 72 +++++++++---------- .../models/llama/configuration_llama.py | 3 + tests/models/llama/test_modeling_llama.py | 54 ++++++++++++++ 3 files changed, 93 insertions(+), 36 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 14a12b939405..bee0269ff82e 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -354,6 +354,11 @@ def _compute_llama3_parameters( def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None): """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + # BC: "rope_type" was originally "type" -- let's gracefully handle it + if "rope_type" not in received_keys and "type" in received_keys: + received_keys -= {"type"} + received_keys.add("rope_type") + missing_keys = required_keys - received_keys if missing_keys: raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") @@ -361,14 +366,14 @@ def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, if optional_keys is not None: unused_keys = received_keys - required_keys - optional_keys else: - unused_keys = received_keys - received_keys + unused_keys = received_keys - required_keys if unused_keys: - raise KeyError(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") def _validate_default_rope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys) @@ -376,19 +381,19 @@ def _validate_default_rope_parameters(config: PretrainedConfig): def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} # TODO (joao): 
update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"original_max_position_embeddings"} @@ -397,12 +402,12 @@ def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") def _validate_yarn_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} optional_keys = {"attention_factor", "beta_fast", "beta_slow"} received_keys = set(rope_scaling.keys()) @@ -410,22 +415,22 @@ def _validate_yarn_parameters(config: PretrainedConfig): factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") attention_factor = rope_scaling.get("attention_factor") if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): - raise ValueError( + logger.warning( f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" ) beta_fast = rope_scaling.get("beta_fast") if beta_fast is not None and not isinstance(beta_fast, float): - raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") beta_slow = rope_scaling.get("beta_slow") if beta_slow is not None and not isinstance(beta_slow, float): - raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") if (beta_fast or 32) < (beta_slow or 1): - raise ValueError( + logger.warning( f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" ) @@ -433,7 +438,7 @@ def _validate_yarn_parameters(config: PretrainedConfig): def _validate_longrope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "short_factor", "long_factor"} # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} @@ -445,15 +450,15 @@ def _validate_longrope_parameters(config: PretrainedConfig): short_factor = rope_scaling.get("short_factor") if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): - raise ValueError(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") if not len(short_factor) == dim // 2: - raise ValueError(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + logger.warning(f"`rope_scaling`'s short_factor field must 
have length {dim // 2}, got {len(short_factor)}") long_factor = rope_scaling.get("long_factor") if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): - raise ValueError(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") if not len(long_factor) == dim // 2: - raise ValueError(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is @@ -468,48 +473,48 @@ def _validate_longrope_parameters(config: PretrainedConfig): else: factor = rope_scaling.get("factor") if factor is None: - raise ValueError("Missing required keys in `rope_scaling`: 'factor'") + logger.warning("Missing required keys in `rope_scaling`: 'factor'") elif not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") attention_factor = rope_scaling.get("attention_factor") if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: - raise ValueError( + logger.warning( f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" ) def _validate_llama3_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") low_freq_factor = rope_scaling["low_freq_factor"] high_freq_factor = rope_scaling["high_freq_factor"] if low_freq_factor is None or not isinstance(low_freq_factor, float): - raise ValueError(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") + logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") if high_freq_factor is None or not isinstance(high_freq_factor, float): - raise ValueError(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") + logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") if high_freq_factor < low_freq_factor: - raise ValueError( + logger.warning( "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" f"{high_freq_factor} and low_freq_factor={low_freq_factor}" ) original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): - raise ValueError( + logger.warning( "`rope_scaling`'s 
original_max_position_embeddings field must be an integer, got " f"{original_max_position_embeddings}" ) if original_max_position_embeddings >= config.max_position_embeddings: - raise ValueError( + logger.warning( "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got " f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" ) @@ -534,17 +539,12 @@ def rope_config_validation(config: PretrainedConfig): if rope_scaling is None: return - possible_rope_types = set(ROPE_INIT_FUNCTIONS.keys()) - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - if rope_type is None: - raise ValueError( - f"rope_scaling must contain a non-None 'rope_type' field. Possible options are {possible_rope_types}" - ) - + # BC: "rope_type" was originally "type" + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default")) validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) if validation_fn is not None: validation_fn(config) else: - raise ValueError( + logger.warning( f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" ) diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index c632a870be7a..710809093f38 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -189,6 +189,9 @@ def __init__( self.mlp_bias = mlp_bias # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] rope_config_validation(self) super().__init__( diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 85d352fc814f..19cb7bd6be39 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -526,6 +526,60 @@ def test_rope_class_retrocompatibility(self): torch.testing.assert_close(old_cos_long, new_cos_long) torch.testing.assert_close(old_sin_long, new_sin_long) + def test_model_loading_old_rope_configs(self): + def _reinitialize_config(base_config, new_kwargs): + # Reinitialize the config with the new kwargs, forcing the config to go through its __init__ validation + # steps. 
+ base_config_dict = base_config.to_dict() + new_config = LlamaConfig.from_dict(config_dict={**base_config_dict, **new_kwargs}) + return new_config + + # from untouched config -> ✅ + base_config, model_inputs = self.model_tester.prepare_config_and_inputs_for_common() + original_model = LlamaForCausalLM(base_config).to(torch_device) + original_model(**model_inputs) + + # from a config with the expected rope configuration -> ✅ + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with the old rope configuration ('type' instead of 'rope_type') -> ✅ we gracefully handle BC + config = _reinitialize_config(base_config, {"rope_scaling": {"type": "linear", "factor": 10.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with both 'type' and 'rope_type' -> ✅ they can coexist (and both are present in the config) + config = _reinitialize_config( + base_config, {"rope_scaling": {"type": "linear", "rope_type": "linear", "factor": 10.0}} + ) + self.assertTrue(config.rope_scaling["type"] == "linear") + self.assertTrue(config.rope_scaling["rope_type"] == "linear") + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with parameters in a bad range ('factor' should be >= 1.0) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": -999.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("factor field", logs.output[0]) + + # from a config with unknown parameters ('foo' isn't a rope option) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config( + base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0, "foo": "bar"}} + ) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("Unrecognized keys", logs.output[0]) + + # from a config with specific rope type but missing one of its mandatory parameters -> ❌ throws exception + with self.assertRaises(KeyError): + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" + @require_flash_attn @require_torch_gpu @require_bitsandbytes From 8d2534c4d0ab94a97a72d2ce6bb9ccd201abadb3 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:06:39 +0200 Subject: [PATCH 032/200] let's not warn when someone is running a forward (#32176) * let's not warn when someone is running a foward without cache + self.training * more models * fixup --- src/transformers/models/cohere/modeling_cohere.py | 4 +++- src/transformers/models/dbrx/modeling_dbrx.py | 4 +++- src/transformers/models/gemma/diff_gemma.py | 4 +++- src/transformers/models/gemma/modeling_gemma.py | 8 ++++++-- src/transformers/models/jetmoe/modeling_jetmoe.py | 4 +++- src/transformers/models/llama/modeling_llama.py | 4 +++- src/transformers/models/mistral/modeling_mistral.py | 2 +- src/transformers/models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 4 +++- 
src/transformers/models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- src/transformers/models/qwen2/modeling_qwen2.py | 2 +- src/transformers/models/qwen2_moe/modeling_qwen2_moe.py | 2 +- src/transformers/models/stablelm/modeling_stablelm.py | 2 +- src/transformers/models/starcoder2/modeling_starcoder2.py | 2 +- 16 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 6532c656d453..6257eeb9958c 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -769,7 +769,9 @@ def forward( past_seen_tokens = 0 return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index e3be8decbc6b..b1f3ce1b8ba9 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1005,7 +1005,9 @@ def forward( inputs_embeds = nn.functional.dropout(inputs_embeds, p=self.emb_pdrop, training=self.training) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/gemma/diff_gemma.py b/src/transformers/models/gemma/diff_gemma.py index d2a653120965..4e2ea82950bf 100644 --- a/src/transformers/models/gemma/diff_gemma.py +++ b/src/transformers/models/gemma/diff_gemma.py @@ -474,7 +474,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False # noqa: F841 - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True # noqa: F841 past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 5bc1af3e7ec7..ae27fe98512f 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -770,7 +770,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False # noqa: F841 - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True # noqa: F841 past_key_values = DynamicCache.from_legacy_cache(past_key_values) @@ -795,7 +797,9 @@ def forward( # See https://github.com/huggingface/transformers/pull/29402 normalizer = torch.tensor(self.config.hidden_size**0.5, 
dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index fa15393a40a5..583751520183 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -978,7 +978,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index adb455acfbbc..d553ce4432f9 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -894,7 +894,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 93a60a49dbf3..7c339271a589 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -758,7 +758,7 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: past_key_values = DynamicCache.from_legacy_cache(past_key_values) return_legacy_cache = True logger.warning_once( diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index d2ee6e6b268a..7df175a0467d 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -960,7 +960,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 74d49d5606c1..16e8711188dd 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -811,7 +811,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for 
BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index af22145e3e9d..c718b7a40633 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -626,7 +626,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 1b23be39e5c0..1289910381e5 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -909,7 +909,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 90b815184b07..dfcb7c2dd009 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -950,7 +950,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 1ff8896ae5f9..67c8c71fe59d 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -808,7 +808,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 54e91da3347d..9b50235c15d5 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -970,7 +970,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 3a3b6a9e05f1..086f525644b2 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ 
-902,7 +902,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index f2786f9df48a..18dbe510450f 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -784,7 +784,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( From 1392a6867f40a55dfabaf306745c67627598b1af Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 24 Jul 2024 19:26:20 +0500 Subject: [PATCH 033/200] Fix resize embedding with Deepspeed (#32192) fix resize when deepspeed --- src/transformers/modeling_utils.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 81403f524f9e..9f32da16fd79 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2131,13 +2131,23 @@ def _get_resized_embeddings( # Replace weights in old_embeddings and return to maintain the same embedding type. # This ensures correct functionality when a Custom Embedding class is passed as input. # The input and output embedding types remain consistent. (c.f. https://github.com/huggingface/transformers/pull/31979) - old_embeddings.weight.data = new_embeddings.weight.data - old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + if is_deepspeed_zero3_enabled() and not is_quantized: + import deepspeed + + params = [old_embeddings.weight, new_embeddings.weight] + with deepspeed.zero.GatheredParameters(params, modifier_rank=0): + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] - # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` - # will be set to `None` in the resized embeddings. - if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: - old_embeddings.padding_idx = None + # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` + # will be set to `None` in the resized embeddings. 
+ if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + else: + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None return old_embeddings From af0e4b7b37b2d7eefe7531cf5201a5d6bae85525 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 24 Jul 2024 17:14:05 +0200 Subject: [PATCH 034/200] Fix float8_e4m3fn in modeling_utils (#32193) * Fix float8_e4m3fn in modeling_utils * style * fix * comment --- src/transformers/modeling_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9f32da16fd79..8f1ad56f6999 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -855,6 +855,8 @@ def _load_state_dict_into_meta_model( for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) + is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn") + for param_name, param in state_dict.items(): # First part of the test is always true as load_state_dict_keys always contains state_dict keys. if param_name not in loaded_state_dict_keys or param_name not in expected_keys: @@ -866,9 +868,10 @@ def _load_state_dict_into_meta_model( module_name = param_name set_module_kwargs = {} - # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params + # We convert floating dtypes to the `dtype` passed except for float8_e4m3fn type. We also want to keep the buffers/params # in int/uint/bool and not cast them. 
- if dtype is not None and torch.is_floating_point(param) and param.dtype != torch.float8_e4m3fn: + is_param_float8_e4m3fn = is_torch_e4m3fn_available and param.dtype == torch.float8_e4m3fn + if dtype is not None and torch.is_floating_point(param) and not is_param_float8_e4m3fn: if ( keep_in_fp32_modules is not None and any( From 1c122a46dc3c4448901f8d2f3018d9d58b846ba5 Mon Sep 17 00:00:00 2001 From: Penut Chen <94501378+PenutChen@users.noreply.github.com> Date: Wed, 24 Jul 2024 23:59:59 +0800 Subject: [PATCH 035/200] Support dequantizing GGUF FP16 format (#31783) * support gguf fp16 * support gguf bf16 with pytorch * add gguf f16 test * remove bf16 --- src/transformers/integrations/ggml.py | 3 +++ tests/quantization/ggml/test_ggml.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 71aa87afa94b..47f3f0cf8d57 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -36,6 +36,7 @@ # Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md GGML_TYPES = { "F32": 0, + "F16": 1, "Q4_0": 2, "Q8_0": 8, "Q2_K": 10, @@ -489,6 +490,8 @@ def dequantize_q5_k(data): def load_dequant_gguf_tensor(shape, ggml_type, data): if ggml_type == GGML_TYPES["F32"]: values = data + elif ggml_type == GGML_TYPES["F16"]: + values = data elif ggml_type == GGML_TYPES["Q8_0"]: values = dequantize_q8_0(data) elif ggml_type == GGML_TYPES["Q4_0"]: diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index a5866094a1cc..e42900a1d51b 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -33,6 +33,7 @@ class GgufIntegrationTests(unittest.TestCase): mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF" llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF" + tinyllama_model_id = "PenutChen/TinyLlama-1.1B-Chat-v1.0-GGUF" q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" @@ -45,6 +46,7 @@ class GgufIntegrationTests(unittest.TestCase): q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf" q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf" q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf" + f16_tinyllama_model_id = "TinyLlama-1.1B-Chat-v1.0.FP16.gguf" example_text = "Hello" @@ -149,6 +151,18 @@ def test_q8_0(self): EXPECTED_TEXT = "Hello, World!\n\n5. Use a library" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_f16(self): + tokenizer = AutoTokenizer.from_pretrained(self.tinyllama_model_id, gguf_file=self.f16_tinyllama_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.tinyllama_model_id, gguf_file=self.f16_tinyllama_model_id + ).to(torch_device) + + text = tokenizer(self.example_text, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello, World!\n\n5. 
Node.js" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_mistral_q4_0(self): tokenizer = AutoTokenizer.from_pretrained(self.mistral_model_id, gguf_file=self.q4_0_mistral_model_id) model = AutoModelForCausalLM.from_pretrained( From edd68f4ed8db241bd3e9dc6c4ed96d471f243c9a Mon Sep 17 00:00:00 2001 From: Matt Date: Wed, 24 Jul 2024 17:36:32 +0100 Subject: [PATCH 036/200] :rotating_light: No more default chat templates (#31733) * No more default chat templates * Add the template to the GPT-SW3 tests since it's not available by default now * Fix GPT2 test * Fix Bloom test * Fix Bloom test * Remove default templates again --- docs/source/en/chat_templating.md | 19 +- docs/source/es/chat_templating.md | 8 +- docs/source/ja/chat_templating.md | 2 +- docs/source/zh/chat_templating.md | 2 +- .../blenderbot/tokenization_blenderbot.py | 14 -- .../tokenization_blenderbot_fast.py | 15 -- .../tokenization_blenderbot_small.py | 15 -- .../tokenization_blenderbot_small_fast.py | 15 -- .../models/bloom/tokenization_bloom_fast.py | 8 - .../code_llama/tokenization_code_llama.py | 55 ------ .../tokenization_code_llama_fast.py | 55 ------ .../models/cohere/tokenization_cohere_fast.py | 182 ------------------ .../tokenization_gptsan_japanese.py | 13 -- .../models/gpt2/tokenization_gpt2.py | 7 - .../models/gpt2/tokenization_gpt2_fast.py | 9 - .../gpt_neox/tokenization_gpt_neox_fast.py | 8 - .../tokenization_gpt_neox_japanese.py | 12 -- .../models/gpt_sw3/tokenization_gpt_sw3.py | 16 -- .../models/idefics2/processing_idefics2.py | 57 ------ .../models/llama/tokenization_llama.py | 54 ------ .../models/llama/tokenization_llama_fast.py | 55 ------ .../processing_llava_next_video.py | 60 ------ .../models/whisper/tokenization_whisper.py | 8 - .../whisper/tokenization_whisper_fast.py | 8 - src/transformers/processing_utils.py | 13 +- src/transformers/tokenization_utils_base.py | 54 ++---- tests/models/bloom/test_tokenization_bloom.py | 1 + tests/models/gpt2/test_tokenization_gpt2.py | 1 + .../gpt_sw3/test_tokenization_gpt_sw3.py | 9 + 29 files changed, 28 insertions(+), 747 deletions(-) diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index d840caaf6605..c4069dd1afc7 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -580,7 +580,7 @@ default template for that model class is used instead. Let's take a look at the >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` @@ -704,23 +704,6 @@ with other names, pass the name of the template you want to the `chat_template` We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend trying to put it all in a single template where possible! -### What are "default" templates? - -Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards -compatibility, we have retained this class-specific handling as default templates, also set at the class level. 
If a -model does not have a chat template set, but there is a default template for its model class, the `TextGenerationPipeline` -class and methods like `apply_chat_template` will use the class template instead. You can find out what the default -template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute. - -This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when -the class template is appropriate for your model, we strongly recommend overriding the default template by -setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured -for chat. - -Now that actual chat templates have been adopted more widely, default templates have been deprecated and will be -removed in a future release. We strongly recommend setting the `chat_template` attribute for any tokenizers that -still depend on them! - ### What template should I use? When setting the template for a model that's already been trained for chat, you should ensure that the template diff --git a/docs/source/es/chat_templating.md b/docs/source/es/chat_templating.md index 10129e87ef11..e287c2137435 100644 --- a/docs/source/es/chat_templating.md +++ b/docs/source/es/chat_templating.md @@ -220,7 +220,7 @@ La plantilla de chat para un modelo se almacena en el atributo `tokenizer.chat_t >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` @@ -307,12 +307,6 @@ Si estás ajustando finamente un modelo para chat, además de establecer una pla -### ¿Qué son las plantillas "default"? - -Antes de la introducción de las plantillas de chat, el manejo del chat estaba codificado en el nivel de la clase del modelo. Por razones de compatibilidad con versiones anteriores, hemos conservado este manejo específico de la clase como plantillas predeterminadas, también establecidas a nivel de clase. Si un modelo no tiene una plantilla de chat establecida, pero hay una plantilla predeterminada para su clase de modelo, la clase `TextGenerationPipeline` y métodos como `apply_chat_template` usarán la plantilla de clase en su lugar. Puedes averiguar cuál es la plantilla predeterminada para tu tokenizador comprobando el atributo `tokenizer.default_chat_template`. - -Esto es algo que hacemos puramente por razones de compatibilidad con versiones anteriores, para evitar romper cualquier flujo de trabajo existente. Incluso cuando la plantilla de clase es apropiada para tu modelo, recomendamos encarecidamente anular la plantilla predeterminada estableciendo explícitamente el atributo `chat_template` para dejar claro a los usuarios que tu modelo ha sido configurado correctamente para el chat, y para estar preparados para el futuro en caso de que las plantillas predeterminadas alguna vez se alteren o se eliminen. - ### ¿Qué plantilla debería usar? Cuando establezcas la plantilla para un modelo que ya ha sido entrenado para chat, debes asegurarte de que la plantilla coincida exactamente con el formato de mensajes que el modelo vio durante el entrenamiento, o de lo contrario es probable que experimentes degradación del rendimiento. 
Esto es cierto incluso si estás entrenando aún más el modelo; probablemente obtendrás el mejor rendimiento si mantienes constantes los tokens de chat. Esto es muy análogo a la tokenización: generalmente obtienes el mejor rendimiento para la inferencia o el ajuste fino cuando coincides precisamente con la tokenización utilizada durante el entrenamiento. diff --git a/docs/source/ja/chat_templating.md b/docs/source/ja/chat_templating.md index 200bf40ac4cf..82db942ef1e1 100644 --- a/docs/source/ja/chat_templating.md +++ b/docs/source/ja/chat_templating.md @@ -85,7 +85,7 @@ LLM(Language Model)のますます一般的な使用事例の1つは「チ >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` diff --git a/docs/source/zh/chat_templating.md b/docs/source/zh/chat_templating.md index a08da47cb27a..e0ab50b634c7 100644 --- a/docs/source/zh/chat_templating.md +++ b/docs/source/zh/chat_templating.md @@ -228,7 +228,7 @@ The sun. >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 677245382334..1a8807214d52 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -405,17 +405,3 @@ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ return token_ids_0 + [self.eos_token_id] - - @property - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 01cbf13809d6..0d24ed62c574 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -287,18 +287,3 @@ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ return token_ids_0 + [self.eos_token_id] - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. 
- """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 832b5315edfd..08c7be332e31 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -217,18 +217,3 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = index += 1 return vocab_file, merge_file - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index a80acdb650e4..21fb76cbfc86 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -98,18 +98,3 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index d0da1621d4c9..54e637735308 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -147,11 +147,3 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. 
- """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 5bbf2d0452f4..cc906687874c 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -437,61 +437,6 @@ def create_token_type_ids_from_sequences( return output - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py 
b/src/transformers/models/code_llama/tokenization_code_llama_fast.py index 9bdb7a65b584..b832348d07af 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py +++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py @@ -349,61 +349,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (out_vocab_file,) - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere_fast.py index b0a62e279ca8..bac665b473c5 100644 --- 
a/src/transformers/models/cohere/tokenization_cohere_fast.py +++ b/src/transformers/models/cohere/tokenization_cohere_fast.py @@ -228,188 +228,6 @@ def add_bos_token(self, value): self._add_bos_token = value self.update_post_processor() - @property - def default_chat_template(self): - """ - Cohere Tokenizer uses <|START_OF_TURN_TOKEN|> and <|END_OF_TURN_TOKEN|> to indicate each turn in a chat. - Additioanlly, to indicate the source of the message, <|USER_TOKEN|>, <|CHATBOT_TOKEN|> and <|SYSTEM_TOKEN|> - for user, assitant and system messages respectively. - - The output should look something like: - <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ preamble }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ How are you? }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{ I am doing well! }}<|END_OF_TURN_TOKEN|> - - Use add_generation_prompt to add a prompt for the model to generate a response: - >>> from transformers import AutoTokenizer - >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") - >>> messages = [{"role": "user", "content": "Hello, how are you?"}] - >>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' - - """ - default_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% if system_message != false %}" # Start with system message - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_template = default_template.replace( - "USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false" - ) - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - default_template = default_template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - tool_use_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% endif %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ '# Safety 
Preamble' }}" - "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}" - "{{ '\n\n# System Preamble' }}" - "{{ '\n## Basic Rules' }}" - "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' }}" - "{{ '\n\n# User Preamble' }}" - "{{ '\n' + system_message }}" - "{{'\n\n## Available Tools\nHere is a list of tools that you have available to you:\n\n'}}" - "{% for tool in tools %}" - "{% if loop.index0 != 0 %}" - "{{ '\n\n'}}" - "{% endif %}" - "{{'```python\ndef ' + tool.name + '('}}" - "{% for param_name, param_fields in tool.parameter_definitions.items() %}" - "{% if loop.index0 != 0 %}" - "{{ ', '}}" - "{% endif %}" - "{{param_name}}: " - "{% if not param_fields.required %}" - "{{'Optional[' + param_fields.type + '] = None'}}" - "{% else %}" - "{{ param_fields.type }}" - "{% endif %}" - "{% endfor %}" - '{{ \') -> List[Dict]:\n """\'}}' - "{{ tool.description }}" - "{% if tool.parameter_definitions|length != 0 %}" - "{{ '\n\n Args:\n '}}" - "{% for param_name, param_fields in tool.parameter_definitions.items() %}" - "{% if loop.index0 != 0 %}" - "{{ '\n ' }}" - "{% endif %}" - "{{ param_name + ' ('}}" - "{% if not param_fields.required %}" - "{{'Optional[' + param_fields.type + ']'}}" - "{% else %}" - "{{ param_fields.type }}" - "{% endif %}" - "{{ '): ' + param_fields.description }}" - "{% endfor %}" - "{% endif %}" - '{{ \'\n """\n pass\n```\' }}' - "{% endfor %}" - "{{ '<|END_OF_TURN_TOKEN|>'}}" - "{% for message in loop_messages %}" - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{{'<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write \\'Action:\\' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user\\'s last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. 
The list of actions you want to call should be formatted as a list of json objects, for example:\n```json\n[\n {\n \"tool_name\": title of the tool in the specification,\n \"parameters\": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters\n }\n]```<|END_OF_TURN_TOKEN|>'}}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_tool_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'") - tool_use_template = tool_use_template.replace("DEFAULT_SYSTEM_MESSAGE", default_tool_message) - - rag_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% endif %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ '# Safety Preamble' }}" - "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}" - "{{ '\n\n# System Preamble' }}" - "{{ '\n## Basic Rules' }}" - "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' }}" - "{{ '\n\n# User Preamble' }}" - "{{ '\n' + system_message }}" - "{{ '<|END_OF_TURN_TOKEN|>'}}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>'}}" - "{{ '' }}" - "{% for document in documents %}" # Loop over all non-system messages - "{{ '\nDocument: ' }}" - "{{ loop.index0 }}\n" - "{% for key, value in document.items() %}" - "{{ key }}: {{value}}\n" - "{% endfor %}" - "{% endfor %}" - "{{ ''}}" - "{{ '<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ 'Carefully perform the following instructions, in order, starting each with a new line.\n' }}" - "{{ 'Firstly, Decide which of the retrieved documents are relevant to the user\\'s last input by writing \\'Relevant Documents:\\' followed by comma-separated list of document numbers. If none are relevant, you should instead write \\'None\\'.\n' }}" - "{{ 'Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user\\'s last input by writing \\'Cited Documents:\\' followed a comma-separated list of document numbers. 
If you dont want to cite any of them, you should instead write \\'None\\'.\n' }}" - "{% if citation_mode=='accurate' %}" - "{{ 'Thirdly, Write \\'Answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.\n' }}" - "{% endif %}" - "{{ 'Finally, Write \\'Grounded answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.' }}" - "{{ '<|END_OF_TURN_TOKEN|>' }}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_rag_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'") - rag_template = rag_template.replace("DEFAULT_SYSTEM_MESSAGE", default_rag_message) - - return {"default": default_template, "tool_use": tool_use_template, "rag": rag_template} - def apply_tool_use_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py index 51789e49b2d2..782f68bf921e 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py @@ -236,19 +236,6 @@ def convert_tokens_to_string(self, tokens): text = "".join(words) return text - @property - def default_chat_template(self): - """ - A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role - information. - """ - return ( - "{% for message in messages %}" - "{% if not loop.first %}{{ bos_token}}{% endif %}" - "{{ sep_token }}{{ message.content }} {{ eos_token }}" - "{% endfor %}" - ) - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 if os.path.isdir(save_directory): diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index 9bca559d9ea0..badacf6dbe71 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -329,10 +329,3 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if is_split_into_words or add_prefix_space: text = " " + text return (text, kwargs) - - @property - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. 
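Because class-level defaults like the one being deleted here no longer apply, tokenizers that relied on them need a template set explicitly. A hedged sketch of the migration (the checkpoint name is only an example; the template string mirrors the simple concatenate-with-EOS behaviour described above):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reproduce the old implicit behaviour by setting the template explicitly.
tokenizer.chat_template = (
    "{% for message in messages %}{{ message['content'] }}{{ eos_token }}{% endfor %}"
)

messages = [
    {"role": "user", "content": "Hi there!"},
    {"role": "assistant", "content": "Hello!"},
]
print(tokenizer.apply_chat_template(messages, tokenize=False))
# "Hi there!<|endoftext|>Hello!<|endoftext|>"
```

Saving the tokenizer afterwards (`tokenizer.save_pretrained(...)`) persists the template, so downstream users no longer depend on any class-level fallback.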
- """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index e6747119f422..90e83f0d35a3 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -139,12 +139,3 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. - """ - - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py index 2504fa3cc051..c79e6d9ada15 100644 --- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py @@ -228,11 +228,3 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. - """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py index f36f7e3fd610..ea7f3959c78d 100644 --- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -161,18 +161,6 @@ def convert_tokens_to_string(self, tokens): out_string = "".join(tokens).strip() return out_string - @property - def default_chat_template(self): - """ - A simple chat template that just adds BOS/EOS tokens around messages while discarding role information. 
- """ - return ( - "{% for message in messages %}" - "{{ bos_token + eos_token + message.content + eos_token }}" - "{% endfor %}" - "{% if add_generation_prompt %} {{ bos_token + eos_token }} {% endif %}" - ) - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 if os.path.isdir(save_directory): diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py index 1000bfd1b6c8..262aeaba5eea 100644 --- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py +++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py @@ -294,19 +294,3 @@ def decode_fast(self, token_ids: Union[int, List[int]]) -> str: """ return self.sp_model.decode(token_ids) - - @property - def default_chat_template(self): - """ - This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings - preceding messages. BOS tokens are added between all messages. - """ - return ( - "{{ eos_token }}{{ bos_token }}" - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}" - "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}" - "{{ message['text'] }}{{ bos_token }}" - "{% endfor %}" - "Bot:" - ) diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c665ba74d06a..2e14118144ba 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -251,60 +251,3 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - @property - def default_chat_template(self): - """ - This template formats inputs in the form of a chat history. For each message in the chat history: - * the template will output the role of the speaker followed by the content of the message. - * content can be a single string or a list of strings and images. - * If the content element is an image, the template will output a sequence of tokens and token before and after each image - * The template will output an token at the end of each message. - - Example: - - ```python - messages = [{ - "role": "user", - "content": [ - {"type": "text", "text": "What’s in this image?"}, - {"type": "image"}, - {"type": "image"}, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": "This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground."},] - }] - ``` - - Will create outputs like: - ``` - User: What is in this Image? - Assistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground. 
- ``` - """ - # fmt: off - return ( - "{% for message in messages %}" - "{{message['role'].capitalize()}}" - "{% if message['content'][0]['type'] == 'image' %}" - "{{':'}}" - "{% else %}" - "{{': '}}" - "{% endif %}" - "{% for line in message['content'] %}" - "{% if line['type'] == 'text' %}" - "{{line['text']}}" - "{% elif line['type'] == 'image' %}" - "{{ '' }}" - "{% endif %}" - "{% endfor %}" - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "{{ 'Assistant:' }}" - "{% endif %}" - ) - # fmt: on diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index 80865ba98d6d..385ad2d88e10 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -411,57 +411,3 @@ def create_token_type_ids_from_sequences( output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) return output - - @property - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. 
- """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py index 91d3bf361517..67e339b4290a 100644 --- a/src/transformers/models/llama/tokenization_llama_fast.py +++ b/src/transformers/models/llama/tokenization_llama_fast.py @@ -241,61 +241,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (out_vocab_file,) - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. 
- """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index 81426b3a0af3..6b5e86ab4149 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -159,63 +159,3 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - @property - def default_chat_template(self): - """ - This default vicuna template formats inputs in the form of a chat history. For each message in the chat history: - * the template will output the role of the speaker followed by the content of the message. - * content is a list of strings and images. - * If the content element is an image, the template will output a sequence of or