Avoid calling get_max_length (huggingface#34971)
fix

Co-authored-by: ydshieh <[email protected]>
2 people authored and BernardZach committed Dec 5, 2024
1 parent 65a5d77 commit 52f9cd1
Showing 4 changed files with 4 additions and 4 deletions.
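
The change is mechanical: every call site that reads a static cache's capacity through the deprecated Cache.get_max_length() now uses get_max_cache_shape() instead. A minimal sketch of the replacement call, assuming a transformers version from around this commit where StaticCache accepts max_batch_size and exposes get_max_cache_shape(); the config values are illustrative only:

    import torch
    from transformers import BloomConfig, StaticCache

    # Illustrative small config; any decoder config with the usual
    # hidden_size / attention-head / layer-count fields works the same way.
    config = BloomConfig(hidden_size=64, n_head=4, n_layer=2)

    past_key_values = StaticCache(
        config=config,
        max_batch_size=1,
        max_cache_len=128,  # fixed capacity of the static cache
        device="cpu",
        dtype=torch.float32,
    )

    # Deprecated call removed by this commit:
    #   target_length = past_key_values.get_max_length()
    # Replacement used in the diffs below:
    target_length = past_key_values.get_max_cache_shape()
    assert target_length == 128

For a StaticCache the returned value is the cache's max_cache_len, which the model code then compares against the attention mask's sequence length.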
2 changes: 1 addition & 1 deletion src/transformers/models/bloom/modeling_bloom.py
@@ -911,7 +911,7 @@ def prepare_inputs_for_generation(
         # This part differs from other models because BLOOM needs a 2D mask to construct alibi tensor
         # The only difference is the usage of 2D instead of 4D mask, but the shape will be static
         if isinstance(past_key_values, StaticCache) and attention_mask is not None:
-            target_length = past_key_values.get_max_length()
+            target_length = past_key_values.get_max_cache_shape()
             batch_size, seq_length = attention_mask.shape
             diff = target_length - seq_length

2 changes: 1 addition & 1 deletion src/transformers/models/granite/modeling_granite.py
@@ -909,7 +909,7 @@ def _update_causal_mask(
         min_dtype = torch.finfo(dtype).min
         sequence_length = input_tensor.shape[1]
         if using_static_cache:
-            target_length = past_key_values.get_max_length()
+            target_length = past_key_values.get_max_cache_shape()
         else:
             target_length = (
                 attention_mask.shape[-1]

2 changes: 1 addition & 1 deletion src/transformers/models/granitemoe/modeling_granitemoe.py
@@ -1142,7 +1142,7 @@ def _update_causal_mask(
         min_dtype = torch.finfo(dtype).min
         sequence_length = input_tensor.shape[1]
         if using_static_cache:
-            target_length = past_key_values.get_max_length()
+            target_length = past_key_values.get_max_cache_shape()
         else:
             target_length = (
                 attention_mask.shape[-1]

2 changes: 1 addition & 1 deletion src/transformers/models/paligemma/modeling_paligemma.py
@@ -353,7 +353,7 @@ def _update_causal_mask(
         min_dtype = torch.finfo(dtype).min
         sequence_length = inputs_embeds.shape[1]
         if using_static_cache:
-            target_length = past_key_values.get_max_length()
+            target_length = past_key_values.get_max_cache_shape()
         else:
             target_length = (
                 attention_mask.shape[-1]
