From 26dfb9824f61d74a1083cf6bc240d553a7ec1844 Mon Sep 17 00:00:00 2001
From: huseinzol05
Date: Mon, 25 Nov 2024 15:29:07 +0800
Subject: [PATCH] added compile

---
 session/translation/end-to-end/README.md      |  99 +--
 ...ck-v4.sh => nanot5-base-multipack-post.sh} |   4 +-
 .../nanot5-small-multipack-compile.sh         |  28 +
 .../end-to-end/nanot5-small-multipack-post.sh |  27 +
 .../end-to-end/run_t5_multipack_compile.py    | 599 ++++++++++++++++++
 5 files changed, 672 insertions(+), 85 deletions(-)
 rename session/translation/end-to-end/{nanot5-small-multipack-v4.sh => nanot5-base-multipack-post.sh} (86%)
 create mode 100644 session/translation/end-to-end/nanot5-small-multipack-compile.sh
 create mode 100644 session/translation/end-to-end/nanot5-small-multipack-post.sh
 create mode 100644 session/translation/end-to-end/run_t5_multipack_compile.py

diff --git a/session/translation/end-to-end/README.md b/session/translation/end-to-end/README.md
index c60e93da..3a235dea 100644
--- a/session/translation/end-to-end/README.md
+++ b/session/translation/end-to-end/README.md
@@ -1,90 +1,23 @@
 # HuggingFace T5
 
-## T5
+## How to run on primeintellect
 
-1. Run prepare dataset, [prepare-data.ipynb](prepare-data.ipynb).
-
-2. Run training script,
-
-Original script, https://github.com/huggingface/transformers/blob/v4.21.2/examples/pytorch/translation/run_translation.py
-
-BASE model,
+```bash
+cd /workspace
+apt update
+apt install vim screen -y
+pip3 install huggingface-hub wandb mosaicml-streaming evaluate
+huggingface-cli login
+wandb login
+pip3 install notebook==6.5.6
+screen -dmS jupyter_session bash -c "jupyter notebook --NotebookApp.token='' --no-browser --allow-root --notebook-dir='/workspace'"
+python -c "
+from huggingface_hub import snapshot_download
+snapshot_download(repo_id='mesolitica/malaysian-translation-v2-multipack-2048-post', repo_type='dataset', local_dir = './malaysian-translation-v2-multipack-2048-post')
+"
+pip3 install git+https://github.com/mesolitica/t5-sdpa-multipack
+bash nanot5-small-multipack-post.sh
 ```
-CUDA_VISIBLE_DEVICES='0' \
-WANDB_DISABLED=true \
-python3 run_t5.py \
---model_name_or_path mesolitica/t5-base-standard-bahasa-cased \
---num_train_epochs 5 \
---logging_steps 100 \
---eval_steps 10000 \
---save_steps 10000 \
---evaluation_strategy steps \
---save_total_limit 3 \
---do_train \
---do_eval \
---source_lang src \
---target_lang tgt \
---train_file shuffled-train.json \
---validation_file test.json \
---output_dir finetune-t5-base-standard-bahasa-cased \
---per_device_train_batch_size=8 \
---per_device_eval_batch_size=4 \
---predict_with_generate \
---max_source_length 1536 \
---max_target_length 1536 \
---learning_rate 2e-4 \
---gradient_checkpointing true
-```
-
-SMALL model,
-```
-WANDB_PROJECT="translation-t5-small" \
-torchrun \
---nproc_per_node 2 \
--m run_t5 \
---model_name_or_path mesolitica/t5-small-standard-bahasa-cased \
---num_train_epochs 3 \
---eval_steps 1000000000 \
---logging_steps 10 \
---save_steps 2000 \
---save_total_limit 3 \
---do_train \
---train_file mosaic \
---output_dir finetune-t5-small-standard-bahasa-cased \
---per_device_train_batch_size=16 \
---per_device_eval_batch_size=4 \
---max_source_length 1536 \
---max_target_length 1536 \
---learning_rate 2e-4 \
---gradient_checkpointing true \
---bf16
-```
-
-TINY model,
-```
-WANDB_PROJECT="translation-t5-small" \
-torchrun \
---nproc_per_node 2 \
--m run_t5 \
---model_name_or_path mesolitica/t5-tiny-standard-bahasa-cased \
---num_train_epochs 3 \
---eval_steps 1000000000 \
---logging_steps 10 \
---save_steps 2000 \ ---save_total_limit 3 \ ---do_train \ ---train_file mosaic \ ---output_dir finetune-t5-tiny-standard-bahasa-cased \ ---per_device_train_batch_size=16 \ ---per_device_eval_batch_size=4 \ ---max_source_length 1536 \ ---max_target_length 1536 \ ---learning_rate 5e-5 \ ---gradient_checkpointing true \ ---bf16 -``` - -## nanoT5 1. Run prepare dataset, [prepare-data-v2.ipynb](prepare-data-v2.ipynb). diff --git a/session/translation/end-to-end/nanot5-small-multipack-v4.sh b/session/translation/end-to-end/nanot5-base-multipack-post.sh similarity index 86% rename from session/translation/end-to-end/nanot5-small-multipack-v4.sh rename to session/translation/end-to-end/nanot5-base-multipack-post.sh index bd3dc95d..97955ed9 100644 --- a/session/translation/end-to-end/nanot5-small-multipack-v4.sh +++ b/session/translation/end-to-end/nanot5-base-multipack-post.sh @@ -9,7 +9,7 @@ torchrun \ --save_steps 200 \ --save_total_limit 3 \ --do_train \ ---train_file malaysian-translation-v2-multipack-2248-post \ +--train_file malaysian-translation-v2-multipack-2048-post \ --output_dir nanot5-small-malaysian-cased-translation-v4-packing-post-v3 \ --dataloader_num_workers=5 \ --per_device_train_batch_size=1 \ @@ -23,4 +23,4 @@ torchrun \ --weight_decay 0.001 \ --bf16 \ --ddp_find_unused_parameters true \ ---model_revision d3d615448b7153d849116cf028a1290f86d7f744 +--model_revision 57cebd22c04e2c3ffe4697a1e9dbfa5839e8750c diff --git a/session/translation/end-to-end/nanot5-small-multipack-compile.sh b/session/translation/end-to-end/nanot5-small-multipack-compile.sh new file mode 100644 index 00000000..b23b78ce --- /dev/null +++ b/session/translation/end-to-end/nanot5-small-multipack-compile.sh @@ -0,0 +1,28 @@ +WANDB_PROJECT="nanot5-small-malaysian-cased-translation-v6-multipack-post" \ +torchrun \ +--nproc_per_node 2 \ +-m run_t5_multipack_compile \ +--model_name_or_path mesolitica/nanot5-small-malaysian-translation-v2 \ +--num_train_epochs 2 \ +--eval_steps 1000000000 \ +--logging_steps 2 \ +--save_steps 200 \ +--save_total_limit 3 \ +--do_train \ +--train_file /home/husein/ssd3/t5-sdpa-multipack/packing-post \ +--output_dir nanot5-small-malaysian-cased-translation-v5-multipack-post \ +--dataloader_num_workers 5 \ +--dataloader_prefetch_factor 4 \ +--per_device_train_batch_size=2 \ +--per_device_eval_batch_size=3 \ +--gradient_accumulation_steps=8 \ +--max_source_length 2048 \ +--max_target_length 2048 \ +--learning_rate 2e-5 \ +--max_grad_norm 1.0 \ +--gradient_checkpointing false \ +--weight_decay 0.001 \ +--bf16 \ +--ddp_find_unused_parameters true \ +--model_revision d3d615448b7153d849116cf028a1290f86d7f744 \ +--torch_compile diff --git a/session/translation/end-to-end/nanot5-small-multipack-post.sh b/session/translation/end-to-end/nanot5-small-multipack-post.sh new file mode 100644 index 00000000..28cce9b1 --- /dev/null +++ b/session/translation/end-to-end/nanot5-small-multipack-post.sh @@ -0,0 +1,27 @@ +WANDB_PROJECT="nanot5-small-malaysian-cased-translation-v6-multipack-post" \ +torchrun \ +--nproc_per_node 2 \ +-m run_t5_multipack \ +--model_name_or_path mesolitica/nanot5-small-malaysian-translation-v2 \ +--num_train_epochs 2 \ +--eval_steps 1000000000 \ +--logging_steps 2 \ +--save_steps 200 \ +--save_total_limit 3 \ +--do_train \ +--train_file /home/husein/ssd3/t5-sdpa-multipack/packing-post \ +--output_dir nanot5-small-malaysian-cased-translation-v5-multipack-post \ +--dataloader_num_workers 5 \ +--dataloader_prefetch_factor 4 \ +--per_device_train_batch_size=4 \ 
+--per_device_eval_batch_size=3 \ +--gradient_accumulation_steps=6 \ +--max_source_length 2048 \ +--max_target_length 2048 \ +--learning_rate 2e-5 \ +--max_grad_norm 1.0 \ +--gradient_checkpointing false \ +--weight_decay 0.001 \ +--bf16 \ +--ddp_find_unused_parameters true \ +--model_revision d3d615448b7153d849116cf028a1290f86d7f744 diff --git a/session/translation/end-to-end/run_t5_multipack_compile.py b/session/translation/end-to-end/run_t5_multipack_compile.py new file mode 100644 index 00000000..55be64d3 --- /dev/null +++ b/session/translation/end-to-end/run_t5_multipack_compile.py @@ -0,0 +1,599 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for sequence to sequence. +""" +# You can also adapt this script on your own sequence to sequence task. +# Pointers for this are left as comments. + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional, Any, Union +import json +import torch +import torch.nn.functional as F +import math +import datasets +import numpy as np +from datasets import load_dataset + +import transformers +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + HfArgumentParser, + M2M100Tokenizer, + MBart50Tokenizer, + MBart50TokenizerFast, + MBartTokenizer, + MBartTokenizerFast, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version +from tokenizers import AddedToken +from transformers.data.data_collator import pad_without_fast_tokenizer_warning +from transformers.tokenization_utils_base import PreTrainedTokenizerBase +from transformers.utils import PaddingStrategy +from streaming import LocalDataset + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.23.0.dev0") + +require_version( + "datasets>=1.8.0", + "To fix: pip install -r examples/pytorch/translation/requirements.txt") + +logger = logging.getLogger(__name__) + +# A list of all multilingual tokenizer which require src_lang and tgt_lang attributes. +MULTILINGUAL_TOKENIZERS = [ + MBartTokenizer, + MBartTokenizerFast, + MBart50Tokenizer, + MBart50TokenizerFast, + M2M100Tokenizer] + +def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. 
the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. + This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_position_if_large = torch.min( + relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_position_if_large) + return relative_buckets + +def compute_bias( + query_length, + key_length, + relative_attention_bias, + bidirectional = True, + num_buckets = 32, + max_distance = 128, + device=None, +): + """Compute binned relative position bias""" + if device is None: + device = relative_attention_bias.weight.device + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = _relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=bidirectional, + num_buckets=num_buckets, + max_distance=max_distance, + ) + values = relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + +def block_diagonal_concat_inverted(*masks, dtype=torch.bfloat16): + total_size = sum(mask.size(0) for mask in masks) + combined_mask = torch.zeros(total_size, total_size, dtype=dtype) + + current_pos = 0 + + for mask in masks: + size = mask.size(0) + combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask + current_pos += size + + min_value = torch.finfo(dtype).min if dtype.is_floating_point else torch.iinfo(dtype).min + inverted_mask = torch.where(combined_mask == 1, torch.tensor(0, dtype=dtype), min_value) + return inverted_mask.unsqueeze(0) + +def pad_attention_mask(attention_mask, maxlen = 2048): + maxlen_right = maxlen + maxlen_bottom = maxlen + attention_mask = [ + F.pad( + attention_mask[i], + (0, maxlen_right - 
attention_mask[i].shape[1], 0, maxlen_bottom - attention_mask[i].shape[0])) for i in range( + len(attention_mask))] + return torch.stack(attention_mask) + +def pad_attention_mask_4d(attention_mask, maxlen = 2048): + maxlen_right = maxlen + maxlen_bottom = maxlen + attention_mask = [ + F.pad( + attention_mask[i], + (0, maxlen_right - attention_mask[i].shape[-2], 0, maxlen_bottom - attention_mask[i].shape[-1])) for i in range( + len(attention_mask))] + return torch.stack(attention_mask) + +def block_diagonal_concat(*masks, dtype=torch.bfloat16): + total_size = sum(mask.size(0) for mask in masks) + combined_mask = torch.zeros(total_size, total_size, dtype=dtype) + + current_pos = 0 + + for mask in masks: + size = mask.size(0) + combined_mask[current_pos:current_pos + size, current_pos:current_pos + size] = mask + current_pos += size + + return combined_mask + +def block_diagonal_concat_4d(*masks, dtype=torch.bfloat16): + total_size = sum(mask.size(1) for mask in masks) + combined_mask = torch.zeros(masks[0].shape[0], + total_size, total_size, dtype=dtype) + + current_pos = 0 + + for mask in masks: + size = mask.size(1) + combined_mask[:, current_pos:current_pos + size, current_pos:current_pos + size] = mask + current_pos += size + + return combined_mask + +def block_diagonal_concat_cross(*masks, dtype=torch.bfloat16): + total_rows = sum(mask.size(0) for mask in masks) + total_cols = sum(mask.size(1) for mask in masks) + + combined_mask = torch.zeros((total_rows, total_cols), dtype=dtype) + + current_row, current_col = 0, 0 + + for mask in masks: + rows, cols = mask.size() + combined_mask[current_row:current_row + rows, current_col:current_col + cols] = mask + current_row += rows + current_col += cols + + return combined_mask + +def collator(batch, pad_token_id = 1, label_pad = -100, maxlen = 2048): + max_length = maxlen + results = {} + results['input_ids'] = [ + b['input_ids'] + [pad_token_id] * (max_length - len(b['input_ids'])) + for b in batch + ] + results['input_ids'] = torch.tensor(results['input_ids'], dtype = torch.int64) + + max_length = maxlen + results['labels'] = [ + b['labels'] + [label_pad] * (max_length - len(b['labels'])) + for b in batch + ] + results['labels'] = torch.tensor(results['labels'], dtype = torch.int64) + + results['position_bias'] = pad_attention_mask_4d([b['position_bias'] for b in batch]) + results['decoder_position_bias'] = pad_attention_mask_4d([b['decoder_position_bias'] for b in batch]) + + attention_mask = [b['attention_mask'] for b in batch] + results['attention_mask'] = pad_attention_mask(attention_mask) + encoder_attention_mask = [b['encoder_attention_mask'] for b in batch] + results['encoder_attention_mask'] = pad_attention_mask(encoder_attention_mask) + decoder_attention_mask = [b['decoder_attention_mask'] for b in batch] + results['decoder_attention_mask'] = pad_attention_mask(decoder_attention_mask) + + dtype = results['attention_mask'].dtype + encoder_extended_attention_mask = results['attention_mask'][:, None, :, :] + encoder_extended_attention_mask = encoder_extended_attention_mask + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(dtype).min + results['position_bias'] = results['position_bias'] + encoder_extended_attention_mask + + dtype = results['decoder_attention_mask'].dtype + encoder_extended_attention_mask = results['decoder_attention_mask'][:, None, :, :] + encoder_extended_attention_mask = encoder_extended_attention_mask + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * 
torch.finfo(dtype).min + results['decoder_position_bias'] = results['decoder_position_bias'] + encoder_extended_attention_mask + + return results + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={ + "help": "Pretrained config name or path if not the same as model_name"}) + tokenizer_name: Optional[str] = field( + default=None, metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name"}) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, metadata={ + "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, ) + model_revision: str = field( + default="main", metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": ( + "Will use the token generated when running `huggingface-cli login` (necessary to use this script " + "with private models).")}, + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + source_lang: str = field(default=None, metadata={"help": "Source language id for translation."}) + target_lang: str = field(default=None, metadata={"help": "Target language id for translation."}) + + dataset_name: Optional[str] = field( + default=None, metadata={ + "help": "The name of the dataset to use (via the datasets library)."}) + dataset_config_name: Optional[str] = field( + default=None, metadata={ + "help": "The configuration name of the dataset to use (via the datasets library)."}) + train_file: Optional[str] = field( + default=None, metadata={ + "help": "The input training data file (a jsonlines)."}) + validation_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input evaluation data file to evaluate the metrics (sacrebleu) on a jsonlines file." + }, + ) + test_file: Optional[str] = field( + default=None, + metadata={ + "help": "An optional input test data file to evaluate the metrics (sacrebleu) on a jsonlines file."}, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_source_length: Optional[int] = field( + default=1024, + metadata={ + "help": ( + "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + ) + }, + ) + max_target_length: Optional[int] = field( + default=128, metadata={ + "help": ( + "The maximum total sequence length for target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.")}, ) + val_max_target_length: Optional[int] = field( + default=None, metadata={ + "help": ( + "The maximum total sequence length for validation target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. 
Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``.")}, ) + pad_to_max_length: bool = field( + default=False, metadata={ + "help": ( + "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU.")}, ) + max_train_samples: Optional[int] = field( + default=None, metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set.")}, ) + max_eval_samples: Optional[int] = field( + default=None, metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set.")}, ) + max_predict_samples: Optional[int] = field( + default=None, metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set.")}, ) + num_beams: Optional[int] = field( + default=None, metadata={ + "help": ( + "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " + "which is used during ``evaluate`` and ``predict``.")}, ) + ignore_pad_token_for_loss: bool = field( + default=True, + metadata={ + "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." + }, + ) + source_prefix: Optional[str] = field( + default=None, metadata={ + "help": "A prefix to add before every source text (useful for T5 models)."}) + forced_bos_token: Optional[str] = field( + default=None, metadata={ + "help": ( + "The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for" + " multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to" + " be the target language token.(Usually it is the target language token)")}, ) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + print(training_args) + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. 
+ send_example_telemetry("run_translation", model_args, data_args) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}") + logger.info(f"Training/evaluation parameters {training_args}") + + if data_args.source_prefix is None and model_args.model_name_or_path in [ + "t5-small", + "t5-base", + "t5-large", + "t5-3b", + "t5-11b", + ]: + logger.warning( + "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with " + "`--source_prefix 'translate English to German: ' `") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir( + training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + + # Set seed before initializing model. + set_seed(training_args.seed) + + + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + attn_implementation = 'sdpa', + ) + print(model) + model.encoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight._requires_grad = False + model.decoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight._requires_grad = False + model.config.use_cache = False + encoder_emb = torch.nn.Embedding( + model.config.relative_attention_num_buckets, + model.config.num_heads + ) + + with torch.no_grad(): + encoder_emb.weight.copy_(model.encoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight.detach()) + + encoder_emb.weight.requires_grad = False + + decoder_emb = torch.nn.Embedding( + model.config.relative_attention_num_buckets, + model.config.num_heads + ) + + with torch.no_grad(): + decoder_emb.weight.copy_(model.decoder.block[0].layer[0].SelfAttention.relative_attention_bias.weight.detach()) + + decoder_emb.weight.requires_grad = False + + def multipack(input_ids, labels, lengths): + results = { + 'input_ids': input_ids, + 'labels': labels, + } + attention_mask = [] + encoder_attention_mask = [] + decoder_attention_mask = [] + encoder_biases = [] + decoder_biases = [] + + for length in 
lengths: + left_len = length[0] + right_len = length[1] + + attention_mask.append(torch.ones(left_len, left_len)) + encoder_attention_mask.append(torch.ones(right_len, left_len)) + decoder_attention_mask.append(torch.tril(torch.ones(right_len, right_len))) + + encoder_bias = compute_bias( + left_len, left_len, + encoder_emb, + bidirectional=True, + num_buckets=model.config.relative_attention_num_buckets, + max_distance=model.config.relative_attention_max_distance, + ) + encoder_biases.append(encoder_bias[0]) + + decoder_bias = compute_bias( + right_len, right_len, + decoder_emb, + bidirectional=False, + num_buckets=model.config.relative_attention_num_buckets, + max_distance=model.config.relative_attention_max_distance, + ) + decoder_biases.append(decoder_bias[0]) + + results['attention_mask'] = block_diagonal_concat(*attention_mask) + results['encoder_attention_mask'] = block_diagonal_concat_cross(*encoder_attention_mask) + results['decoder_attention_mask'] = block_diagonal_concat(*decoder_attention_mask) + + results['position_bias'] = block_diagonal_concat_4d(*encoder_biases) + results['decoder_position_bias'] = block_diagonal_concat_4d(*decoder_biases) + + return results + + class DatasetFixed(torch.utils.data.Dataset): + def __init__(self, remote): + + self.dataset = LocalDataset(local=remote) + + def __getitem__(self, idx): + row = self.dataset[idx] + d = json.loads(row['data']) + return multipack(**d) + + def __len__(self): + return len(self.dataset) + + train_dataset = DatasetFixed(data_args.train_file) + + # Initialize our Trainer + trainer = Seq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=None, + tokenizer=tokenizer, + data_collator=collator, + compute_metrics=None, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main()
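
Note on the multipack masking above: `multipack()` packs several translation pairs into one sequence and builds block-diagonal attention masks so tokens never attend across pack boundaries. A minimal sketch of that idea, using only PyTorch (`torch.block_diag` stands in here for the patch's `block_diagonal_concat*` helpers, and the lengths are illustrative):

```python
import torch

# Two packed (source_len, target_len) pairs, analogous to the `lengths` rows in the dataset.
lengths = [(3, 2), (4, 3)]

# Encoder self-attention: full attention inside each packed source segment.
enc_self = torch.block_diag(*[torch.ones(s, s) for s, _ in lengths])
# Decoder self-attention: causal attention inside each packed target segment.
dec_self = torch.block_diag(*[torch.tril(torch.ones(t, t)) for _, t in lengths])
# Cross-attention: each target segment only sees its own source segment.
cross = torch.block_diag(*[torch.ones(t, s) for s, t in lengths])

print(enc_self.shape)  # torch.Size([7, 7]) -> 3 + 4 source tokens
print(dec_self.shape)  # torch.Size([5, 5]) -> 2 + 3 target tokens
print(cross.shape)     # torch.Size([5, 7])
# Off-diagonal blocks stay zero, so attention never crosses pack boundaries;
# the per-segment relative position biases are concatenated in the same block-diagonal way.
```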
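
Note on the collator above: it pads `input_ids`/`labels` and every mask out to a fixed `maxlen` (2048), which keeps tensor shapes identical across steps; that is helpful when `--torch_compile` is enabled, since changing shapes tend to trigger recompilation. A hedged sketch of that padding pattern (the `pad_batch` name and the small `maxlen` are illustrative; `pad_token_id=1` and `label_pad=-100` mirror the collator defaults):

```python
import torch
import torch.nn.functional as F

def pad_batch(batch, pad_token_id=1, label_pad=-100, maxlen=16):
    # Pad token ids and labels on the right to a fixed length.
    input_ids = torch.tensor(
        [b["input_ids"] + [pad_token_id] * (maxlen - len(b["input_ids"])) for b in batch])
    labels = torch.tensor(
        [b["labels"] + [label_pad] * (maxlen - len(b["labels"])) for b in batch])
    # Pad each 2D attention mask on the right and bottom up to (maxlen, maxlen).
    masks = torch.stack([
        F.pad(b["attention_mask"],
              (0, maxlen - b["attention_mask"].shape[1], 0, maxlen - b["attention_mask"].shape[0]))
        for b in batch])
    return {"input_ids": input_ids, "labels": labels, "attention_mask": masks}

batch = [
    {"input_ids": [5, 6, 7], "labels": [8, 9], "attention_mask": torch.ones(3, 3)},
    {"input_ids": [5, 6, 7, 8, 9], "labels": [8, 9, 10], "attention_mask": torch.ones(5, 5)},
]
out = pad_batch(batch)
print(out["input_ids"].shape, out["attention_mask"].shape)  # torch.Size([2, 16]) torch.Size([2, 16, 16])
```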