
Commit 469a96e

Fix format
Signed-off-by: Jee Jee Li <[email protected]>
jeejeelee committed Nov 5, 2024
1 parent 56f0572 commit 469a96e
Showing 2 changed files with 9 additions and 12 deletions.
6 changes: 1 addition & 5 deletions tests/models/decoder_only/vision_language/test_idefics3.py
@@ -1,7 +1,7 @@
 from typing import List, Optional, Tuple, Type
 
 import pytest
-from transformers import AutoTokenizer
+from transformers import AutoModelForVision2Seq, AutoTokenizer
 
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -61,10 +61,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    # HACK - this is an attempted workaround for the following bug
-    # https://github.com/huggingface/transformers/issues/34307
-    from transformers import AutoModelForVision2Seq  # noqa: F401
-
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
15 changes: 8 additions & 7 deletions vllm/model_executor/models/idefics3.py
@@ -318,7 +318,7 @@ def get_max_idefics3_image_tokens(ctx: InputContext,


 def dummy_data_for_idefics3(ctx: InputContext, seq_len: int,
-                            mm_counts: Mapping[str, int]):
+                            mm_counts: Mapping[str, int]) -> DummyData:
     hf_config = ctx.get_hf_config()
     num_images = mm_counts["image"]

@@ -345,7 +345,7 @@ def __init__(self, config):
         output_size = config.text_config.hidden_size
         self.proj = ReplicatedLinear(input_size, output_size, bias=False)
 
-    def forward(self, x) -> torch.Tensor:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         out, _ = self.proj(x)
         return out

@@ -357,7 +357,9 @@ def __init__(self, config):
         self.scale_factor = config.scale_factor
         self.modality_projection = Idefics3SimpleMLP(config)
 
-    def pixel_shuffle(self, x, scale_factor=2):
+    def pixel_shuffle(self,
+                      x: torch.Tensor,
+                      scale_factor: int = 2) -> torch.Tensor:
         bsz, seq, embed_dim = x.size()
         height = width = int(seq**0.5)
         x = x.view(bsz, height, width, embed_dim)
@@ -375,7 +377,7 @@ def pixel_shuffle(self, x, scale_factor=2):
                       embed_dim * (scale_factor**2))
         return x
 
-    def forward(self, image_hidden_states):
+    def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor:
         image_hidden_states = self.pixel_shuffle(image_hidden_states,
                                                  self.scale_factor)
         image_hidden_states = self.modality_projection(image_hidden_states)
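
Aside (not part of the commit): a minimal, self-contained sketch of the shape contract that the newly annotated pixel_shuffle implements, assuming the sequence forms a square token grid whose side is divisible by scale_factor. The sizes below are invented for illustration, and the rearrangement is an equivalent formulation rather than the exact reshape/permute sequence in the source:

    import torch

    # Invented sizes: 2 images, an 8x8 grid of vision tokens, 16-dim embeddings.
    bsz, seq, embed_dim, scale_factor = 2, 64, 16, 2
    x = torch.randn(bsz, seq, embed_dim)

    # Fold each scale_factor x scale_factor neighborhood of the token grid into
    # the channel dimension: (bsz, seq, d) -> (bsz, seq // s**2, d * s**2).
    side = int(seq**0.5)
    blocks = x.view(bsz, side // scale_factor, scale_factor,
                    side // scale_factor, scale_factor, embed_dim)
    out = blocks.permute(0, 1, 3, 2, 4, 5).reshape(
        bsz, seq // scale_factor**2, embed_dim * scale_factor**2)
    assert out.shape == (2, 16, 64)  # 64 tokens -> 16; 16 dims -> 64
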
@@ -467,7 +469,6 @@ def _parse_and_validate_image_input(

     def _image_pixels_to_features(
         self,
-        vision_model: Idefics3VisionTransformer,
         pixel_values: torch.Tensor,
         pixel_attention_mask: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
@@ -511,7 +512,7 @@ def _image_pixels_to_features(
         patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
 
         # Get sequence from the vision encoder
-        image_hidden_states = vision_model(
+        image_hidden_states = self.vision_model(
             pixel_values=pixel_values,
             patch_attention_mask=patch_attention_mask,
         )
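
Aside (not part of the commit): the patch_attention_mask above keeps a vision patch when at least one pixel inside it is valid. A toy sketch of that reduction with invented sizes; the unfold-based construction of patches_subgrid is an assumption about how the elided code above this hunk tiles the pixel mask:

    import torch

    # Invented sizes: one image, a 2x2 grid of 4x4-pixel patches (8x8 pixel mask).
    pixel_attention_mask = torch.zeros(1, 8, 8, dtype=torch.bool)
    pixel_attention_mask[:, :4, :4] = True  # only the top-left quadrant is real

    # Tile the pixel mask into patch-sized subgrids, then mark a patch valid
    # when any of its pixels is valid.
    patches_subgrid = pixel_attention_mask.unfold(1, 4, 4).unfold(2, 4, 4)
    patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
    assert patch_attention_mask.shape == (1, 2, 2)
    assert patch_attention_mask.sum().item() == 1  # only the top-left patch kept
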
@@ -525,7 +526,7 @@ def _process_image_pixels(
         pixel_values = inputs["data"]
         pixel_attention_mask = inputs["pixel_attention_mask"]
 
-        return self._image_pixels_to_features(self.vision_model, pixel_values,
+        return self._image_pixels_to_features(pixel_values,
                                               pixel_attention_mask)
 
     def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor:
