diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index cc2739de5a12f..4b2f2dda6cbe8 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -15,7 +15,7 @@
 """Inference-only Idefics3 model compatible with HuggingFace weights."""
 
 import math
-from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Tuple,
+from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)
 
 import torch
@@ -104,53 +104,40 @@ def input_mapper_for_idefics3(
     return MultiModalInputs(batch_data)
 
 
-def _resize_output_size_rescale_to_max_len(
-        height: int,
-        width: int,
-        min_len: Optional[int] = 1,
-        max_len: Optional[int] = None) -> Tuple[int, int]:
+def _resize_output_size(height: int,
+                        width: int,
+                        max_len: Optional[int] = None,
+                        min_len: Optional[int] = 1,
+                        max_size: Optional[int] = None) -> Tuple[int, int]:
+    """Rescale so the longer edge is max_len, keeping the aspect ratio."""
     max_len = max(height, width) if max_len is None else max_len
     aspect_ratio = width / height
 
+    # max_size, when given, caps the length of the longer edge.
+    if max_size is not None:
+        max_len = min(max_len, max_size)
+
+    # Only the derived edge is made even; the longer edge stays at max_len.
     if width >= height:
         width = max_len
         height = int(width / aspect_ratio)
         if height % 2 != 0:
             height += 1
-    elif height > width:
+    else:
         height = max_len
         width = int(height * aspect_ratio)
         if width % 2 != 0:
             width += 1
 
     # Avoid resizing to a size smaller than min_len
     height = max(height, min_len)
     width = max(width, min_len)
-    return height, width
-
-
-def _resize_output_size_scale_below_upper_bound(
-        height: int,
-        width: int,
-        max_len: Optional[Dict[str, int]] = None) -> Tuple[int, int]:
-    max_len = max(height, width) if max_len is None else max_len
 
-    aspect_ratio = width / height
-    if width >= height and width > max_len:
-        width = max_len
-        height = int(width / aspect_ratio)
-    elif height > width and height > max_len:
-        height = max_len
-        width = int(height * aspect_ratio)
-
-    # Avoid resizing to a size smaller than 1
-    height = max(height, 1)
-    width = max(width, 1)
     return height, width
 
 
 def _get_resize_output_image_size(
-    image_size,
+    image_size: Tuple[int, int],
     resolution_max_side: int,
     max_image_size: int = 1820,
 ) -> Tuple[int, int]:
@@ -162,17 +149,16 @@ def _get_resize_output_image_size(
 
     # Find the output size, when rescaling the longest edge to max_len and
     # preserving the aspect ratio
-    height, width = _resize_output_size_rescale_to_max_len(
-        height, width, max_len=resolution_max_side)
-    # Find the output size when scaling the image to be below the max_image_size
-    height, width = _resize_output_size_scale_below_upper_bound(
-        height, width, max_len=max_image_size)
+    height, width = _resize_output_size(height,
+                                        width,
+                                        max_len=resolution_max_side,
+                                        max_size=max_image_size)
     return height, width
 
 
-def _prompt_split_image(image_seq_len, image_rows, image_cols,
-                        fake_token_around_image, image_token,
-                        global_img_token):
+def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int,
+                        fake_token_around_image: str, image_token: str,
+                        global_img_token: str) -> str:
     """
     Prompt with expanded image tokens for when the image is split 
     into patches.
@@ -192,16 +178,16 @@ def _prompt_split_image(image_seq_len, image_rows, image_cols,
     return text_split_images
 
 
-def _prompt_single_image(image_seq_len, fake_token_around_image, image_token,
-                         global_img_token):
+def _prompt_single_image(image_seq_len: int, fake_token_around_image: str,
+                         image_token: str, global_img_token: str) -> str:
     """Prompt with expanded image tokens for a single image."""
     return (f"{fake_token_around_image}" + f"{global_img_token}" +
             f"{image_token}" * image_seq_len + f"{fake_token_around_image}")
 
 
-def _get_image_prompt_string(image_rows, image_cols, image_seq_len,
-                             fake_token_around_image, image_token,
-                             global_img_token):
+def _get_image_prompt_string(image_rows: int, image_cols: int,
+                             image_seq_len: int, fake_token_around_image: str,
+                             image_token: str, global_img_token: str):
     if image_rows == 0 and image_cols == 0:
         return _prompt_single_image(
             image_seq_len,