
Commit 469a96e

Fix format
Signed-off-by: Jee Jee Li <[email protected]>
jeejeelee committed Nov 5, 2024
1 parent 56f0572 commit 469a96e
Showing 2 changed files with 9 additions and 12 deletions.
6 changes: 1 addition & 5 deletions tests/models/decoder_only/vision_language/test_idefics3.py
@@ -1,7 +1,7 @@
 from typing import List, Optional, Tuple, Type
 
 import pytest
-from transformers import AutoTokenizer
+from transformers import AutoModelForVision2Seq, AutoTokenizer
 
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -61,10 +61,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    # HACK - this is an attempted workaround for the following bug
-    # https://github.com/huggingface/transformers/issues/34307
-    from transformers import AutoModelForVision2Seq  # noqa: F401
-
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
15 changes: 8 additions & 7 deletions vllm/model_executor/models/idefics3.py
@@ -318,7 +318,7 @@ def get_max_idefics3_image_tokens(ctx: InputContext,


 def dummy_data_for_idefics3(ctx: InputContext, seq_len: int,
-                            mm_counts: Mapping[str, int]):
+                            mm_counts: Mapping[str, int]) -> DummyData:
     hf_config = ctx.get_hf_config()
     num_images = mm_counts["image"]

@@ -345,7 +345,7 @@ def __init__(self, config):
         output_size = config.text_config.hidden_size
         self.proj = ReplicatedLinear(input_size, output_size, bias=False)
 
-    def forward(self, x) -> torch.Tensor:
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         out, _ = self.proj(x)
         return out

@@ -357,7 +357,9 @@ def __init__(self, config):
         self.scale_factor = config.scale_factor
         self.modality_projection = Idefics3SimpleMLP(config)
 
-    def pixel_shuffle(self, x, scale_factor=2):
+    def pixel_shuffle(self,
+                      x: torch.Tensor,
+                      scale_factor: int = 2) -> torch.Tensor:
         bsz, seq, embed_dim = x.size()
         height = width = int(seq**0.5)
         x = x.view(bsz, height, width, embed_dim)
@@ -375,7 +377,7 @@ def pixel_shuffle(self, x, scale_factor=2):
                       embed_dim * (scale_factor**2))
         return x
 
-    def forward(self, image_hidden_states):
+    def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor:
         image_hidden_states = self.pixel_shuffle(image_hidden_states,
                                                  self.scale_factor)
         image_hidden_states = self.modality_projection(image_hidden_states)
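
Aside (not part of the commit): a minimal, self-contained sketch of the shape contract that the newly annotated pixel_shuffle implements, assuming the sequence forms a square token grid whose side is divisible by scale_factor. The sizes below are invented for illustration, and the rearrangement is an equivalent formulation rather than the exact reshape/permute sequence in the source:

    import torch

    # Invented sizes: 2 images, an 8x8 grid of vision tokens, 16-dim embeddings.
    bsz, seq, embed_dim, scale_factor = 2, 64, 16, 2
    x = torch.randn(bsz, seq, embed_dim)

    # Fold each scale_factor x scale_factor neighborhood of the token grid into
    # the channel dimension: (bsz, seq, d) -> (bsz, seq // s**2, d * s**2).
    side = int(seq**0.5)
    blocks = x.view(bsz, side // scale_factor, scale_factor,
                    side // scale_factor, scale_factor, embed_dim)
    out = blocks.permute(0, 1, 3, 2, 4, 5).reshape(
        bsz, seq // scale_factor**2, embed_dim * scale_factor**2)
    assert out.shape == (2, 16, 64)  # 64 tokens -> 16; 16 dims -> 64
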
@@ -467,7 +469,6 @@ def _parse_and_validate_image_input(

     def _image_pixels_to_features(
         self,
-        vision_model: Idefics3VisionTransformer,
         pixel_values: torch.Tensor,
         pixel_attention_mask: Optional[torch.BoolTensor] = None,
     ) -> torch.Tensor:
@@ -511,7 +512,7 @@ def _image_pixels_to_features(
         patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
 
         # Get sequence from the vision encoder
-        image_hidden_states = vision_model(
+        image_hidden_states = self.vision_model(
             pixel_values=pixel_values,
             patch_attention_mask=patch_attention_mask,
         )
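
Aside (not part of the commit): the patch_attention_mask above keeps a vision patch when at least one pixel inside it is valid. A toy sketch of that reduction with invented sizes; the unfold-based construction of patches_subgrid is an assumption about how the elided code above this hunk tiles the pixel mask:

    import torch

    # Invented sizes: one image, a 2x2 grid of 4x4-pixel patches (8x8 pixel mask).
    pixel_attention_mask = torch.zeros(1, 8, 8, dtype=torch.bool)
    pixel_attention_mask[:, :4, :4] = True  # only the top-left quadrant is real

    # Tile the pixel mask into patch-sized subgrids, then mark a patch valid
    # when any of its pixels is valid.
    patches_subgrid = pixel_attention_mask.unfold(1, 4, 4).unfold(2, 4, 4)
    patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
    assert patch_attention_mask.shape == (1, 2, 2)
    assert patch_attention_mask.sum().item() == 1  # only the top-left patch kept
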
@@ -525,7 +526,7 @@ def _process_image_pixels(
         pixel_values = inputs["data"]
         pixel_attention_mask = inputs["pixel_attention_mask"]
 
-        return self._image_pixels_to_features(self.vision_model, pixel_values,
+        return self._image_pixels_to_features(pixel_values,
                                               pixel_attention_mask)
 
     def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor:
