diff --git a/tests/conftest.py b/tests/conftest.py
index b11bbcb4ab7d1..6adff5e2328c4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -232,20 +232,22 @@ def video_assets() -> _VideoAssets:
     return VIDEO_ASSETS
 
 
-_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
+_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
 
 
 class HfRunner:
 
-    def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
+    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
         if device is None:
-            return self.wrap_device(
-                input, "cpu" if current_platform.is_cpu() else "cuda")
+            device = "cpu" if current_platform.is_cpu() else "cuda"
 
-        if hasattr(input, "device") and input.device.type == device:
-            return input
+        if isinstance(x, dict):
+            return {k: self.wrap_device(v, device) for k, v in x.items()}
 
-        return input.to(device)
+        if hasattr(x, "device") and x.device.type == device:
+            return x
+
+        return x.to(device)
 
     def __init__(
         self,
diff --git a/tests/models/decoder_only/vision_language/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py
index 8334451970a4f..4bd678b9f21c4 100644
--- a/tests/models/decoder_only/vision_language/test_chameleon.py
+++ b/tests/models/decoder_only/vision_language/test_chameleon.py
@@ -1,6 +1,7 @@
 from typing import List, Optional, Type
 
 import pytest
+import transformers
 from transformers import AutoModelForVision2Seq, BatchEncoding
 
 from vllm.multimodal.utils import rescale_image_size
@@ -93,6 +94,10 @@ def process(hf_inputs: BatchEncoding):
         )
 
 
+@pytest.mark.skipif(
+    transformers.__version__.startswith("4.46.0"),
+    reason="Model broken in HF, see huggingface/transformers#34379",
+)
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
diff --git a/tests/models/decoder_only/vision_language/test_minicpmv.py b/tests/models/decoder_only/vision_language/test_minicpmv.py
index 1d4e752052273..d3a0561f65797 100644
--- a/tests/models/decoder_only/vision_language/test_minicpmv.py
+++ b/tests/models/decoder_only/vision_language/test_minicpmv.py
@@ -32,8 +32,8 @@
 models = ["openbmb/MiniCPM-Llama3-V-2_5"]
 
 
-def _wrap_inputs(hf_inputs: BatchEncoding) -> BatchEncoding:
-    return BatchEncoding({"model_inputs": hf_inputs})
+def _wrap_inputs(hf_inputs: BatchEncoding):
+    return {"model_inputs": hf_inputs}
 
 
 def trunc_hf_output(hf_output: Tuple[List[int], str,
diff --git a/tests/models/decoder_only/vision_language/test_paligemma.py b/tests/models/decoder_only/vision_language/test_paligemma.py
index d7e29ea76ba4e..a3ca0845e5ff8 100644
--- a/tests/models/decoder_only/vision_language/test_paligemma.py
+++ b/tests/models/decoder_only/vision_language/test_paligemma.py
@@ -2,11 +2,12 @@
 from typing import List, Optional, Tuple, Type
 
 import pytest
-from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
+from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
+                          BatchEncoding)
 
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_hip
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_hip
 
 from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
@@ -74,6 +75,7 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
+    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_image = [(
@@ -100,7 +102,14 @@ def run_test(
         for prompts, images in inputs_per_image
     ]
 
-    with hf_runner(model, dtype=dtype,
+    def process(hf_inputs: BatchEncoding):
+        hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
+            .to(torch_dtype)  # type: ignore
+        return hf_inputs
+
+    with hf_runner(model,
+                   dtype=dtype,
+                   postprocess_inputs=process,
                    auto_cls=AutoModelForVision2Seq) as hf_model:
         hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,