From 4f74d7220a4f5d7d079001e1876d778a35321702 Mon Sep 17 00:00:00 2001
From: bmullick-amd
Date: Tue, 17 Dec 2024 18:26:17 -0800
Subject: [PATCH] fixed github actions

---
 vllm/model_executor/models/t5.py | 443 +++++++++----------------------
 1 file changed, 123 insertions(+), 320 deletions(-)

diff --git a/vllm/model_executor/models/t5.py b/vllm/model_executor/models/t5.py
index 019827aeeb1b7..b3ef11a441546 100644
--- a/vllm/model_executor/models/t5.py
+++ b/vllm/model_executor/models/t5.py
@@ -128,123 +128,76 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
             if hasattr(pointer, "layer_norm"):
                 pointer = getattr(pointer, "layer_norm", None)
             elif hasattr(pointer, "final_layer_norm"):
-                pointer = getattr(pointer, "final_layer_norm")
+                pointer = getattr(pointer, "final_layer_norm", None)
         elif scope_names[0] == "scale":
-            pointer = getattr(pointer, "weight")
+            pointer = getattr(pointer, "weight", None)
         elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
-            pointer = getattr(pointer, "bias")
+            pointer = getattr(pointer, "bias", None)
         elif scope_names[0] == "squad":
-            pointer = getattr(pointer, "classifier")
+            pointer = getattr(pointer, "classifier", None)
         elif scope_names[0] == "decoder" and name[1] == "logits":
             continue
         elif scope_names[0] == "logits":
-            pointer = getattr(pointer, "lm_head")
-        elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit():
+            pointer = getattr(pointer, "lm_head", None)
+        elif (scope_names[0] == "wi" and len(scope_names) > 1
+              and scope_names[1].isdigit()):
             pointer = getattr(pointer, f"wi_{scope_names[1]}")
             continue
         else:
             try:
                 pointer = getattr(pointer, scope_names[0])
             except AttributeError:
-                logger.info(f"Skipping {'/'.join(name)}")
+                log_name = '/'.join(name)
+                logger.info("Skipping %s", log_name)
                 continue
         if len(scope_names) >= 2:
             num = int(scope_names[1])
             pointer = pointer[num]
     if scope_names[0] not in ["kernel", "scale", "embedding"]:
-        pointer = getattr(pointer, "weight")
+        pointer = getattr(pointer, "weight", None)
     if scope_names[0] != "embedding":
-        logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
+        logger.info("Transpose weight of shape %s for %s",
+                    str(array.shape), name)
         array = np.transpose(array)
     try:
         if pointer.shape != array.shape:
-            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+            raise ValueError("Pointer and array shape mismatched")
     except AssertionError as e:
         e.args += (pointer.shape, array.shape)
         raise
-    logger.info(f"Initialize PyTorch weight {name}")
+    logger.info("Initialize PyTorch weight %s", name)
     pointer.data = torch.from_numpy(array.astype(np.float32))
     tf_weights.pop(txt_name, None)
-
-    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
+    weight_not_copied = ', '.join(tf_weights.keys())
+    logger.info("Weights not copied to PyTorch model: %s.",
+                weight_not_copied)
     return model
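Reviewer note: with the `getattr(..., None)` defaults introduced above, a missing attribute no longer raises immediately; the failure would instead surface later as `'NoneType' object has no attribute 'shape'`. A minimal, standalone sketch of one way to guard that case (the `resolve_attr` helper below is illustrative only and not part of this patch):

```python
# Illustrative sketch only -- not part of vllm/model_executor/models/t5.py.
import logging

logger = logging.getLogger(__name__)


def resolve_attr(pointer, attr_name, weight_name):
    """Return pointer.<attr_name>, or None (with a log line) if it is absent."""
    resolved = getattr(pointer, attr_name, None)
    if resolved is None:
        logger.info("Skipping %s: %s has no attribute %r", weight_name,
                    type(pointer).__name__, attr_name)
    return resolved
```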
-
-####################################################
-# PyTorch Models are constructed by sub-classing
-#    - torch.nn.Module for the layers and
-#    - PreTrainedModel for the models (it-self a sub-class of nn.Module)
-####################################################
-PARALLELIZE_DOCSTRING = r"""
-    This is an experimental feature and is a subject to change at a moment's notice.
-
-    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
-    it will evenly distribute blocks across all devices.
-
-    Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
-            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
-            automatically mapped to the first device (for esoteric reasons). That means that the first device should
-            have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
-            following number of attention modules:
-
-            - t5-small: 6
-            - t5-base: 12
-            - t5-large: 24
-            - t5-3b: 24
-            - t5-11b: 24
-
-    Example:
-
-    ```python
-    # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
-    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
-    device_map = {
-        0: [0, 1, 2],
-        1: [3, 4, 5, 6, 7, 8, 9],
-        2: [10, 11, 12, 13, 14, 15, 16],
-        3: [17, 18, 19, 20, 21, 22, 23],
-    }
-    model.parallelize(device_map)
-    ```
-"""
-DEPARALLELIZE_DOCSTRING = r"""
-    Moves the model to cpu from a model parallel state.
-
-    Example:
-
-    ```python
-    # On a 4 GPU machine with t5-3b:
-    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
-    device_map = {
-        0: [0, 1, 2],
-        1: [3, 4, 5, 6, 7, 8, 9],
-        2: [10, 11, 12, 13, 14, 15, 16],
-        3: [17, 18, 19, 20, 21, 22, 23],
-    }
-    model.parallelize(device_map)  # Splits the model across several devices
-    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
-    ```
-"""
-
-
 class T5LayerNorm(nn.Module):
 
     def __init__(self, hidden_size, eps=1e-6):
         """
-        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        Construct a layernorm module in the T5 style.
+        No bias and no subtraction of mean.
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
 
     def forward(self, hidden_states):
-        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
-        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
-        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
-        # half-precision inputs is done in fp32
+        """
+        T5 uses a layer_norm which only scales and doesn't shift, which is
+        also known as Root Mean Square Layer Normalization
+        https://arxiv.org/abs/1910.07467, thus variance is calculated w/o
+        mean and there is no bias. Additionally we want to make sure that
+        the accumulation for half-precision inputs is done in fp32.
+        """
         variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        adj_var = variance + self.variance_epsilon
+        hidden_states = hidden_states * torch.rsqrt(adj_var)
 
         # convert into half-precision if necessary
         if self.weight.dtype in [torch.float16, torch.bfloat16]:
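For reference, the scale-only normalization implemented by `T5LayerNorm` above can be written as the short standalone module below (same math: fp32 variance, no mean subtraction, no bias). This is a sketch for illustration, not a replacement for the patched class:

```python
# Standalone sketch of T5-style RMS layer norm.
import torch
import torch.nn as nn


class RMSNormSketch(nn.Module):

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # accumulate the variance in fp32 even for fp16/bf16 inputs
        variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.eps)
        # cast back so the output dtype matches the weight dtype
        return self.weight * x.to(self.weight.dtype)
```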
@@ -254,10 +207,13 @@ def forward(self, hidden_states):
 
 
 class T5DenseActDense(nn.Module):
-    def __init__(self, config: T5Config, quant_config: Optional[QuantizationConfig] = None):
+    def __init__(self, config: T5Config,
+                 quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
-        self.wi = ColumnParallelLinear(config.d_model, config.d_ff, bias=False, quant_config=quant_config)
-        self.wo = RowParallelLinear(config.d_ff, config.d_model, bias=False, quant_config=quant_config)
+        self.wi = ColumnParallelLinear(config.d_model, config.d_ff,
+                                       bias=False, quant_config=quant_config)
+        self.wo = RowParallelLinear(config.d_ff, config.d_model,
+                                    bias=False, quant_config=quant_config)
         self.dropout = nn.Dropout(config.dropout_rate)
         self.act = get_act_fn(config.dense_act_fn, quant_config)
 
@@ -277,11 +233,15 @@ def forward(self, hidden_states):
 
 
 class T5DenseGatedActDense(nn.Module):
-    def __init__(self, config: T5Config, quant_config: Optional[QuantizationConfig] = None):
+    def __init__(self, config: T5Config,
+                 quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
-        self.wi_0 = ColumnParallelLinear(config.d_model, config.d_ff, bias=False, quant_config=quant_config)
-        self.wi_1 = ColumnParallelLinear(config.d_model, config.d_ff, bias=False, quant_config=quant_config)
-        self.wo = RowParallelLinear(config.d_ff, config.d_model, bias=False, quant_config=quant_config)
+        self.wi_0 = ColumnParallelLinear(config.d_model, config.d_ff,
+                                         bias=False, quant_config=quant_config)
+        self.wi_1 = ColumnParallelLinear(config.d_model, config.d_ff,
+                                         bias=False, quant_config=quant_config)
+        self.wo = RowParallelLinear(config.d_ff, config.d_model,
+                                    bias=False, quant_config=quant_config)
         self.dropout = nn.Dropout(config.dropout_rate)
         self.act = get_act_fn(config.dense_act_fn, quant_config)
 
@@ -291,9 +251,11 @@ def forward(self, hidden_states):
         hidden_states = hidden_gelu * hidden_linear
         hidden_states = self.dropout(hidden_states)
 
-        # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
+        # To make 8bit quantization work for google/flan-t5-xxl,
+        # self.wo is kept in float32.
         # See https://github.com/huggingface/transformers/issues/20287
-        # we also make sure the weights are not in `int8` in case users will force `_keep_in_fp32_modules` to be `None``
+        # we also make sure the weights are not in `int8` in case users
+        # will force `_keep_in_fp32_modules` to be `None`
         if (
             isinstance(self.wo.weight, torch.Tensor)
             and hidden_states.dtype != self.wo.weight.dtype
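The gated block whose forward pass begins above computes `wo(dropout(act(wi_0(x)) * wi_1(x)))`. A plain-PyTorch sketch of that data flow, using `nn.Linear` in place of the tensor-parallel linear layers in this file (illustrative only):

```python
# Sketch of the gated feed-forward data flow used by flan-T5 style models.
import torch
import torch.nn as nn


class GatedFFSketch(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.wi_0 = nn.Linear(d_model, d_ff, bias=False)  # gate projection
        self.wi_1 = nn.Linear(d_model, d_ff, bias=False)  # value projection
        self.wo = nn.Linear(d_ff, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.act(self.wi_0(x)) * self.wi_1(x)
        return self.wo(self.dropout(hidden))
```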
@@ -306,14 +268,16 @@ def forward(self, hidden_states):
 
 
 class T5LayerFF(nn.Module):
-    def __init__(self, config: T5Config, quant_config: Optional[QuantizationConfig] = None):
+    def __init__(self, config: T5Config,
+                 quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
         if config.is_gated_act:
             self.DenseReluDense = T5DenseGatedActDense(config, quant_config)
         else:
             self.DenseReluDense = T5DenseActDense(config, quant_config)
 
-        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.layer_norm = T5LayerNorm(config.d_model,
+                                      eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states):
@@ -323,12 +287,17 @@ def forward(self, hidden_states):
         return hidden_states
 
 
 class T5Attention(nn.Module):
-    def __init__(self, config: T5Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, has_relative_attention_bias=False):
+    def __init__(self, config: T5Config,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 has_relative_attention_bias=False):
         super().__init__()
         self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias
-        self.relative_attention_num_buckets = config.relative_attention_num_buckets
-        self.relative_attention_max_distance = config.relative_attention_max_distance
+        rel_num_bucket = config.relative_attention_num_buckets
+        rel_max_dist = config.relative_attention_max_distance
+        self.relative_attention_num_buckets = rel_num_bucket
+        self.relative_attention_max_distance = rel_max_dist
         self.d_model = config.d_model
         self.key_value_proj_dim = config.d_kv
         self.n_heads = config.num_heads
@@ -364,7 +333,9 @@ def forward(
     ) -> torch.Tensor:
         """
-        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
+        Self-attention (if key_value_states is None) or attention over
+        source sentence (provided by key_value_states).
         """
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([2048, 2048, 2048], dim=-1)
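Reviewer note: the `[2048, 2048, 2048]` split sizes in the (unchanged) lines above are hard-coded and only match checkpoints whose per-layer projection width is 2048 (e.g. 32 heads with d_kv=64). A more general approach derives the sizes from the config; a sketch, assuming `inner_dim = n_heads * d_kv` as in the Hugging Face T5Attention:

```python
# Sketch only: split a fused QKV projection by a computed width instead of
# a hard-coded 2048.
import torch


def split_qkv(qkv: torch.Tensor, n_heads: int, d_kv: int):
    """Split a [..., 3 * n_heads * d_kv] tensor into q, k, v."""
    inner_dim = n_heads * d_kv
    return qkv.split([inner_dim, inner_dim, inner_dim], dim=-1)


# 32 heads * 64 dims per head -> the 2048 used above; other configs differ.
q, k, v = split_qkv(torch.zeros(2, 5, 3 * 32 * 64), n_heads=32, d_kv=64)
```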
""" qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([2048, 2048, 2048], dim=-1) @@ -385,10 +356,16 @@ def forward( return output, present_key_value_state class T5LayerSelfAttention(nn.Module): - def __init__(self, config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, has_relative_attention_bias=False): + def __init__(self, config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + has_relative_attention_bias=False): super().__init__() - self.SelfAttention = T5Attention(config, cache_config, quant_config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.SelfAttention = T5Attention(config, cache_config, + quant_config, + has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, + eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, @@ -399,15 +376,19 @@ def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, kv_cache, attn_metadata) hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + outputs = (hidden_states,) + attention_output[1:] return outputs class T5LayerCrossAttention(nn.Module): - def __init__(self, config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None): + def __init__(self, config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.EncDecAttention = T5Attention(config, cache_config, quant_config, has_relative_attention_bias=False) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.EncDecAttention = T5Attention(config, cache_config, + quant_config, has_relative_attention_bias=False) + self.layer_norm = T5LayerNorm(config.d_model, + eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, @@ -424,17 +405,23 @@ def forward(self, encoder_hidden_states, ) layer_output = hidden_states + self.dropout(attention_output[0]) - outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + outputs = (layer_output,) + attention_output[1:] return outputs class T5Block(nn.Module): - def __init__(self, config: T5Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, has_relative_attention_bias=False): + def __init__(self, config: T5Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + has_relative_attention_bias=False): super().__init__() self.is_decoder = config.is_decoder - self.self_attn = T5LayerSelfAttention(config, cache_config, quant_config, has_relative_attention_bias=has_relative_attention_bias) + self.self_attn = T5LayerSelfAttention(config, cache_config, + quant_config, + has_relative_attention_bias=has_relative_attention_bias) if self.is_decoder: - self.cross_attn = T5LayerCrossAttention(config, cache_config, quant_config) + self.cross_attn = T5LayerCrossAttention(config, + cache_config, quant_config) self.fc = T5LayerFF(config, quant_config) @@ -445,9 +432,10 @@ def forward( attn_metadata: AttentionMetadata, encoder_hidden_states: Optional[torch.Tensor] = None, ) -> torch.Tensor: - self_attention_outputs = 
+        self_attention_outputs = self.self_attn(hidden_states,
+                                                kv_cache, attn_metadata)
         hidden, _ = self_attention_outputs[:2]
-        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights
+        attention_outputs = self_attention_outputs[2:]
 
         # clamp inf values to enable fp16 training
         if hidden.dtype == torch.float16:
@@ -458,9 +446,11 @@ def forward(
             )
             hidden = torch.clamp(hidden, min=-clamp_value, max=clamp_value)
 
-        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
+        do_cross_attention = (self.is_decoder and
+                              encoder_hidden_states is not None)
         if do_cross_attention:
-            cross_attention_outputs = self.cross_attn(hidden, kv_cache, attn_metadata, encoder_hidden_states)
+            cross_attention_outputs = self.cross_attn(hidden, kv_cache,
+                attn_metadata, encoder_hidden_states)
             hidden = cross_attention_outputs[0]
 
             # clamp inf values to enable fp16 training
@@ -519,16 +509,20 @@ def __init__(self,
         self.is_decoder = config.is_decoder
 
         self.block = nn.ModuleList(
-            [T5Block(config, cache_config, quant_config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
+            [T5Block(config, cache_config, quant_config,
+                     has_relative_attention_bias=bool(i == 0))
+             for i in range(config.num_layers)]
         )
-        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.final_layer_norm = T5LayerNorm(config.d_model,
+                                            eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, input_ids: torch.Tensor,
                 positions: torch.Tensor,
                 kv_caches: List[torch.Tensor],
                 attn_metadata: AttentionMetadata,
-                encoder_hidden_states: Optional[torch.Tensor]=None) -> torch.Tensor:
+                encoder_hidden_states: Optional[torch.Tensor] = None
+                ) -> torch.Tensor:
         inputs_embeds = self.embed_tokens(input_ids)
         hidden_states = self.dropout(inputs_embeds)
         # print('t5 stack', type(hidden_states))
@@ -543,171 +537,14 @@ def forward(self, input_ids: torch.Tensor,
         hidden_states = self.dropout(hidden_states)
         return hidden_states
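Because `T5Stack` serves as both the encoder and the decoder, the wiring in `T5Model.forward` later in this patch reduces to: run the encoder once when encoder tokens are present, then hand its output to the decoder for cross-attention. A schematic sketch of that call order, with tensors and vLLM metadata treated as opaque placeholders (the driver function itself is hypothetical):

```python
# Schematic sketch of the encoder/decoder call order; argument names follow
# T5Stack.forward above, but run_encoder_decoder is illustrative only.
def run_encoder_decoder(model, input_ids, positions, encoder_input_ids,
                        encoder_positions, kv_caches, attn_metadata):
    encoder_hidden_states = None
    if encoder_input_ids.numel() > 0:
        # prefill: encode the source sequence once
        encoder_hidden_states = model.encoder(encoder_input_ids,
                                              encoder_positions, kv_caches,
                                              attn_metadata)
    # decode, attending to the encoder output through cross-attention
    return model.decoder(input_ids, positions, kv_caches, attn_metadata,
                         encoder_hidden_states=encoder_hidden_states)
```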
-
-
-T5_START_DOCSTRING = r"""
-
-    The T5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
-    Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan
-    Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a
-    text-to-text denoising generative setting.
-
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`T5Config`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-T5_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
-            should be able to pad the inputs on both the right and the left.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for detail.
-
-            [What are input IDs?](../glossary#input-ids)
-
-            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Indices of decoder input sequence tokens in the vocabulary.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are decoder input IDs?](../glossary#decoder-input-ids)
-
-            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
-            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
-
-            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
-            Training](./t5#training).
-        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
-            be used by default.
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
-            1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
-            1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
-            `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
-            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
-            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
-            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
-            input (see `past_key_values`). This is useful if you want more control over how to convert
-            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
-
-            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
-            of `inputs_embeds`.
-
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-T5_ENCODER_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
-            should be able to pad the inputs on both the right and the left.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for detail.
-
-            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
-            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
-__HEAD_MASK_WARNING_MSG = """
-The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
-`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
-If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
-num_heads)`.
-"""
+
 
 class T5Model(nn.Module):
     _keys_to_ignore_on_load_unexpected = [
         "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
     ]
-    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+    _tied_weights_keys = ["encoder.embed_tokens.weight",
+                          "decoder.embed_tokens.weight"]
 
     def __init__(self,
                  config: T5Config,
@@ -730,13 +567,15 @@ def __init__(self,
         encoder_config.is_decoder = False
         encoder_config.use_cache = False
         encoder_config.is_encoder_decoder = False
-        self.encoder = T5Stack(encoder_config, cache_config, quant_config, self.shared)
+        self.encoder = T5Stack(encoder_config,
+                               cache_config, quant_config, self.shared)
 
         decoder_config = copy.deepcopy(config)
         decoder_config.is_decoder = True
         decoder_config.is_encoder_decoder = False
         decoder_config.num_layers = config.num_decoder_layers
-        self.decoder = T5Stack(decoder_config, cache_config, quant_config, self.shared)
+        self.decoder = T5Stack(decoder_config,
+                               cache_config, quant_config, self.shared)
 
     def forward(
         self, input_ids: torch.Tensor, positions: torch.Tensor,
@@ -744,30 +583,6 @@ def forward(
         encoder_input_ids: torch.Tensor, encoder_positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata) -> torch.Tensor:
-        r"""
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, T5Model
-
-        >>> tokenizer = AutoTokenizer.from_pretrained("t5-small")
-        >>> model = T5Model.from_pretrained("t5-small")
-
-        >>> input_ids = tokenizer(
-        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
-        ... ).input_ids  # Batch size 1
-        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
-
-        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
-        >>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
-        >>> decoder_input_ids = model._shift_right(decoder_input_ids)
-
-        >>> # forward pass
-        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
-        >>> last_hidden_states = outputs.last_hidden_state
-        ```"""
         encoder_hidden_states = None
 
         if encoder_input_ids.numel() > 0:
@@ -914,7 +729,7 @@ def _rename_layer_types(
     ) -> str:
         for enc_dec, mapping in self.layer_type_mapping.items():
             if enc_dec in name:
-                for layer_num in mapping.keys():
+                for layer_num in mapping:
                     if layer_num in name:
                         name = name.replace(layer_num, mapping[layer_num])
         return name
@@ -929,40 +744,33 @@ def _rename_stacked_param(
                 )
                 return name, mapping["shard_id"]
         return name, None
-
-    # def get_set(self, model_params_dict):
-    #     out = set()
-    #     for key in model_params_dict.keys():
-    #         if "bias" in key:
-    #             print('BBBIIIAAASSSSS..................')
-    #         if 'decoder' not in key and 'encoder' not in key:
-    #             print(key)
-    #             # print(key.split('.'))
-    #         lst = key.split('.')
-    #         if len(lst)>=4:
-    #             out.add(lst[3])
-    #     return out
 
     def match_weight_name(self, weights_tuple_list):
         out = set()
         for name, _ in weights_tuple_list:
             # print(name)
             if 'decoder' in name and 'layer_norm' not in name:
-                if 'layer.0' in name and 'SelfAttention' not in name:
+                if ('layer.0' in name
+                        and 'SelfAttention' not in name):
                     print(name)
                     out.add(False)
-                elif 'layer.1' in name and 'EncDecAttention' not in name:
+                elif ('layer.1' in name
+                        and 'EncDecAttention' not in name):
                    print(name)
                    out.add(False)
-                elif 'layer.2' in name and 'DenseReluDense' not in name:
+                elif ('layer.2' in name
+                        and 'DenseReluDense' not in name):
                    print(name)
                    out.add(False)
                else:
                    out.add(True)
            elif 'encoder' in name and 'layer_norm' not in name:
-                if 'layer.0' in name and 'SelfAttention' not in name:
+                if ('layer.0' in name
+                        and 'SelfAttention' not in name):
                    print(name)
                    out.add(False)
-                elif 'layer.1' in name and 'DenseReluDense' not in name:
+                elif ('layer.1' in name
+                        and 'DenseReluDense' not in name):
                    print(name)
                    out.add(False)
                else:
                    out.add(True)
@@ -974,13 +782,9 @@ def match_weight_name(self, weights_tuple_list):
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         model_params_dict = dict(self.model.named_parameters())
-        # types = self.get_set(model_params_dict)
-
         top_params_dict = dict(self.named_parameters())
-
         weights_tuple_list = list(weights)
 
         shared_embedding_weight = None
-        shared_embedding_shard_id = None
 
         for name, loaded_weight in weights_tuple_list:
             name = self._rename_layer_types(name)
@@ -991,7 +795,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 assert shared_embedding_weight is None, (
                     "Conflicting embedding weights.")
                 shared_embedding_weight = loaded_weight
-                shared_embedding_shard_id = shard_id
             else:
                 # Skip the specific downstream task weight.
                 if name.startswith('cls.'):