From 51a624bf024e351e678b598521b72a2e19b5e2ef Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Thu, 26 Dec 2024 12:23:20 +0800
Subject: [PATCH] [Misc] Move some multimodal utils to modality-specific
 modules (#11494)

Signed-off-by: DarkLight1337
---
 .../decoder_only/vision_language/test_awq.py  |  2 +-
 .../vision_language/test_h2ovl.py             |  2 +-
 .../vision_language/test_phi3v.py             |  2 +-
 .../vision_language/test_qwen2_vl.py          |  4 +-
 .../vision_language/vlm_utils/builders.py     |  5 +-
 .../vlm_utils/custom_inputs.py                |  5 +-
 .../vision_language/test_mllama.py            |  2 +-
 tests/multimodal/test_mapper.py               |  2 +-
 vllm/assets/video.py                          |  2 +-
 vllm/multimodal/audio.py                      | 12 ++++
 vllm/multimodal/image.py                      | 12 ++++
 vllm/multimodal/utils.py                      | 68 +------------------
 vllm/multimodal/video.py                      | 43 ++++++++++++
 13 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py
index 6e6e5b40d6a35..18ceb34a4e042 100644
--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
index 45a7365204403..7406df253e7f0 100644
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -8,7 +8,7 @@
 # Import the functions to test
 from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
                                               image_to_pixel_values_wrapper)
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 models = [
     "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index 82eae0705c9ba..3a8934adfb076 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -5,7 +5,7 @@
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
index 71b6ba4dca435..51fe7d2ad32a8 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -6,8 +6,8 @@
 from PIL import Image
 
 from vllm.entrypoints.llm import LLM
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
 
 from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
                           PromptVideoInput, VllmRunner)
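For downstream code that imported these helpers from `vllm.multimodal.utils`, the migration is a one-line import change. A minimal sketch (the image size here is made up for illustration, and assumes vLLM is installed):

```python
from PIL import Image

# Old location (removed by this patch):
#   from vllm.multimodal.utils import rescale_image_size
from vllm.multimodal.image import rescale_image_size

image = Image.new("RGB", (640, 480))
half = rescale_image_size(image, 0.5)
print(half.size)  # (320, 240)
```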
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
index 66668296139f5..59773be709fa8 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
@@ -5,8 +5,9 @@
 
 import torch
 
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 
 from .....conftest import _ImageAssets, _VideoAssets
 from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
index e698d8d3f6f56..2291f4fa0d0ac 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
@@ -1,8 +1,9 @@
 """Custom input builders for edge-cases in different models."""
 from typing import Callable
 
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 
 from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
 from .builders import build_multi_image_inputs, build_single_image_inputs
diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index 77dd1d81f84d7..636a3eedff31b 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -6,7 +6,7 @@
 
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                      global_force_attn_backend_context_manager)
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 71832acbd17b8..81f2a06182bcc 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -6,7 +6,7 @@
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MultiModalRegistry
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 
 @pytest.fixture
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index e4dcab10466db..e6779935bad17 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -7,7 +7,7 @@
 from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from vllm.multimodal.utils import (sample_frames_from_video,
+from vllm.multimodal.video import (sample_frames_from_video,
                                    try_import_video_packages)
 
 from .base import get_cache_dir
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index c92deddbcb255..314d21b746236 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 import numpy as np
 import numpy.typing as npt
 
@@ -26,6 +28,16 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         "There is no default maximum multimodal tokens")
 
 
+def try_import_audio_packages() -> tuple[Any, Any]:
+    try:
+        import librosa
+        import soundfile
+    except ImportError as exc:
+        raise ImportError(
+            "Please install vllm[audio] for audio support.") from exc
+    return librosa, soundfile
+
+
 def resample_audio(
     audio: npt.NDArray[np.floating],
     *,
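The relocated `try_import_audio_packages` helper keeps `librosa` and `soundfile` as optional dependencies by deferring the import to call time. A sketch of the calling pattern (the `load_audio_file` wrapper is hypothetical, not part of vLLM, and assumes `vllm[audio]` is installed):

```python
from vllm.multimodal.audio import try_import_audio_packages

def load_audio_file(path: str):
    # Raises a descriptive ImportError unless vllm[audio] is installed.
    librosa, _ = try_import_audio_packages()
    # librosa.load returns (samples, sample_rate); sr=None keeps the native rate.
    return librosa.load(path, sr=None)
```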
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 97bbce1ce1570..c705e1a3d1554 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -84,3 +84,15 @@ def _default_input_mapper(
 
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 3000
+
+
+def rescale_image_size(image: Image.Image,
+                       size_factor: float,
+                       transpose: int = -1) -> Image.Image:
+    """Rescale the dimensions of an image by a constant factor."""
+    new_width = int(image.width * size_factor)
+    new_height = int(image.height * size_factor)
+    image = image.resize((new_width, new_height))
+    if transpose >= 0:
+        image = image.transpose(Image.Transpose(transpose))
+    return image
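`rescale_image_size` multiplies both dimensions by `size_factor`; a non-negative `transpose` value is interpreted as a `PIL.Image.Transpose` member and applied after resizing. A quick usage sketch (illustrative values only):

```python
from PIL import Image

from vllm.multimodal.image import rescale_image_size

img = Image.new("RGB", (200, 100))

# Plain rescale: both dimensions are scaled by the factor.
doubled = rescale_image_size(img, 2.0)
assert doubled.size == (400, 200)

# Image.Transpose.ROTATE_90 is an IntEnum member, so it passes the
# `transpose >= 0` check and swaps width and height.
rotated = rescale_image_size(img, 1.0, transpose=Image.Transpose.ROTATE_90)
assert rotated.size == (100, 200)
```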
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index be9643598448d..1cb9036bdfda3 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -2,7 +2,7 @@
 import os
 from functools import lru_cache
 from io import BytesIO
-from typing import Any, List, Optional, Tuple, TypeVar, Union
+from typing import List, Optional, Tuple, TypeVar, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -14,7 +14,9 @@
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+from .audio import try_import_audio_packages
 from .inputs import MultiModalDataDict, PlaceholderRange
+from .video import try_import_video_packages
 
 logger = init_logger(__name__)
 
@@ -198,16 +200,6 @@ async def async_fetch_video(video_url: str,
     return video
 
 
-def try_import_audio_packages() -> Tuple[Any, Any]:
-    try:
-        import librosa
-        import soundfile
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[audio] for audio support.") from exc
-    return librosa, soundfile
-
-
 def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
     """
     Load audio from a URL.
@@ -324,60 +316,6 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     return _load_image_from_bytes(base64.b64decode(image))
 
 
-def rescale_image_size(image: Image.Image,
-                       size_factor: float,
-                       transpose: int = -1) -> Image.Image:
-    """Rescale the dimensions of an image by a constant factor."""
-    new_width = int(image.width * size_factor)
-    new_height = int(image.height * size_factor)
-    image = image.resize((new_width, new_height))
-    if transpose >= 0:
-        image = image.transpose(Image.Transpose(transpose))
-    return image
-
-
-def try_import_video_packages():
-    try:
-        import cv2
-        import decord
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[video] for video support.") from exc
-    return cv2, decord
-
-
-def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
-
-    num_frames, _, _, channels = frames.shape
-    new_height, new_width = size
-    resized_frames = np.empty((num_frames, new_height, new_width, channels),
-                              dtype=frames.dtype)
-    for i, frame in enumerate(frames):
-        resized_frame = cv2.resize(frame, (new_width, new_height))
-        resized_frames[i] = resized_frame
-    return resized_frames
-
-
-def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
-    _, height, width, _ = frames.shape
-    new_height = int(height * size_factor)
-    new_width = int(width * size_factor)
-
-    return resize_video(frames, (new_height, new_width))
-
-
-def sample_frames_from_video(frames: npt.NDArray,
-                             num_frames: int) -> npt.NDArray:
-    total_frames = frames.shape[0]
-    if num_frames == -1:
-        return frames
-    else:
-        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
-        sampled_frames = frames[frame_indices, ...]
-        return sampled_frames
-
-
 def encode_video_base64(frames: npt.NDArray) -> str:
     base64_frames = []
     frames_list = [frames[i] for i in range(frames.shape[0])]
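The video helpers keep the same semantics in their new home: `rescale_video_size` scales height and width by a factor and delegates to `resize_video`, which uses OpenCV (so `vllm[video]` must be installed). An illustrative sketch with an arbitrary all-zeros clip:

```python
import numpy as np

from vllm.multimodal.video import rescale_video_size

# Frames are laid out as (num_frames, height, width, channels).
frames = np.zeros((8, 480, 640, 3), dtype=np.uint8)

# 480 * 0.25 = 120, 640 * 0.25 = 160; frame count and channels are preserved.
small = rescale_video_size(frames, 0.25)
assert small.shape == (8, 120, 160, 3)
```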
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index ba9bf58a4a20c..bfcdef70718bc 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -2,6 +2,7 @@
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import numpy as np
+import numpy.typing as npt
 
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
@@ -75,3 +76,45 @@ def _default_input_mapper(
 
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 4096
+
+
+def try_import_video_packages() -> tuple[Any, Any]:
+    try:
+        import cv2
+        import decord
+    except ImportError as exc:
+        raise ImportError(
+            "Please install vllm[video] for video support.") from exc
+    return cv2, decord
+
+
+def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
+    cv2, _ = try_import_video_packages()
+
+    num_frames, _, _, channels = frames.shape
+    new_height, new_width = size
+    resized_frames = np.empty((num_frames, new_height, new_width, channels),
+                              dtype=frames.dtype)
+    for i, frame in enumerate(frames):
+        resized_frame = cv2.resize(frame, (new_width, new_height))
+        resized_frames[i] = resized_frame
+    return resized_frames
+
+
+def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
+    _, height, width, _ = frames.shape
+    new_height = int(height * size_factor)
+    new_width = int(width * size_factor)
+
+    return resize_video(frames, (new_height, new_width))
+
+
+def sample_frames_from_video(frames: npt.NDArray,
+                             num_frames: int) -> npt.NDArray:
+    total_frames = frames.shape[0]
+    if num_frames == -1:
+        return frames
+
+    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+    sampled_frames = frames[frame_indices, ...]
+    return sampled_frames
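`sample_frames_from_video` passes frames through unchanged when `num_frames == -1` and otherwise samples uniformly via `np.linspace`, always keeping the first and last frame. A small sketch with synthetic frames (illustrative data only):

```python
import numpy as np

from vllm.multimodal.video import sample_frames_from_video

# Ten 2x2 RGB frames where frame i is filled with the value i.
frames = np.arange(10)[:, None, None, None] * np.ones((10, 2, 2, 3), dtype=int)

# num_frames == -1 returns the input unchanged.
assert sample_frames_from_video(frames, -1).shape[0] == 10

# np.linspace(0, 9, 4, dtype=int) -> indices [0, 3, 6, 9].
sampled = sample_frames_from_video(frames, 4)
assert sampled.shape == (4, 2, 2, 3)
assert sampled[0, 0, 0, 0] == 0 and sampled[-1, 0, 0, 0] == 9
```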