From 0a648b2400206deb3bbd895bee96d851dbd3ab68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Fri, 23 Aug 2024 16:59:51 +0800 Subject: [PATCH 01/34] Add support to Qwen2-VL. --- vllm/config.py | 9 +- .../model_executor/layers/rotary_embedding.py | 171 ++++- vllm/model_executor/models/__init__.py | 2 + vllm/model_executor/models/qwen2_vl.py | 715 ++++++++++++++++++ vllm/multimodal/registry.py | 3 +- vllm/multimodal/video.py | 17 + vllm/sequence.py | 11 + vllm/worker/model_runner.py | 96 ++- 8 files changed, 1008 insertions(+), 16 deletions(-) create mode 100644 vllm/model_executor/models/qwen2_vl.py create mode 100644 vllm/multimodal/video.py diff --git a/vllm/config.py b/vllm/config.py index 4cbdde5e113a2..59ce868a98f7a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -690,7 +690,7 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. - + """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO @@ -1647,8 +1647,11 @@ def _get_and_verify_max_len( "with rope_scaling. Please raise an issue so we can " "investigate.") - assert "factor" in rope_scaling - scaling_factor = rope_scaling["factor"] + if rope_type == "mrope": + scaling_factor = 1 + else: + assert "factor" in rope_scaling + scaling_factor = rope_scaling["factor"] if rope_type == "yarn": derived_max_model_len = rope_scaling[ "original_max_position_embeddings"] diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 0562b71aa7493..1b454e3cebf3c 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -765,6 +765,170 @@ def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: return new_freqs +class MRotaryEmbedding(RotaryEmbedding): + """Rotary Embedding with Multimodal Sections.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + dtype: torch.dtype, + mrope_section: List[int] = None, + ) -> None: + super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) + + self.mrope_section = mrope_section + if self.mrope_section: + assert sum(self.mrope_section) == rotary_dim // 2 + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """PyTorch-native implementation equivalent to forward().""" + + qk_ndim_in = query.ndim + + query = query.view(*query.shape[:-1], -1, self.head_size) + key = key.view(*key.shape[:-1], -1, self.head_size) + + query_rot = query[..., :self.rotary_dim] + key_rot = key[..., :self.rotary_dim] + if self.rotary_dim < self.head_size: + query_pass = query[..., self.rotary_dim:] + key_pass = key[..., self.rotary_dim:] + + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if self.mrope_section and positions.ndim == query.ndim - 1: + cos = torch.cat([m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], dim=-1) + sin = torch.cat([m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], dim=-1) + + if self.is_neox_style: + # NOTE(woosuk): Here we assume that the positions tensor has the + # shape [batch_size, seq_len]. 
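+            # In the M-RoPE case, positions has shape (3, seq_len), one row per
+            # temporal/height/width section, and the per-section cos/sin were
+            # already merged above (mrope_section, e.g. [16, 24, 24], sums to
+            # rotary_dim // 2). The repeat below expands the half-dim cos/sin
+            # to the full rotary_dim and unsqueeze(-2) adds a head axis so they
+            # broadcast against query_rot / key_rot.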
+ cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj + query_rot = query_rot * cos + rotate_fn(query_rot) * sin + key_rot = key_rot * cos + rotate_fn(key_rot) * sin + + if self.rotary_dim < self.head_size: + query = torch.cat((query_rot, query_pass), dim=-1) + key = torch.cat((key_rot, key_pass), dim=-1) + else: + query = query_rot + key = key_rot + + query = query.flatten(-2) + key = key.flatten(-2) + if query.ndim > qk_ndim_in: + query = query.squeeze(0) + key = key.squeeze(1) + + return query, key + + @staticmethod + def get_input_positions( + input_tokens: List[int], + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + vision_end_token_id: int, + spatial_merge_size: int, + context_len: int = 0, + ) -> Tuple[List[List[int]], int]: + """Get mrope input positions and delta value.""" + + if torch.is_tensor(image_grid_thw): + image_grid_thw = image_grid_thw.tolist() + if torch.is_tensor(video_grid_thw): + video_grid_thw = video_grid_thw.tolist() + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere(input_tokens_tensor == vision_start_token_id).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = t, h // spatial_merge_size, w // spatial_merge_size + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, 
context_len:] + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + + return llm_positions.tolist(), mrope_position_delta + + @staticmethod + def get_next_input_positions( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> List[List[int]]: + return [ + list(range(context_len + mrope_position_delta, seq_len + mrope_position_delta)) + for _ in range(3) + ] + + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} @@ -805,7 +969,7 @@ def get_rope( # The correct one should be "longrope" but keep "su" here # for backward compatible if scaling_type not in {"su", "longrope"}: - scaling_factor = rope_scaling["factor"] + scaling_factor = rope_scaling.get("factor") if scaling_type == "llama3": low_freq_factor = rope_scaling["low_freq_factor"] high_freq_factor = rope_scaling["high_freq_factor"] @@ -869,6 +1033,11 @@ def get_rope( head_size, rotary_dim, max_position, original_max_position, base, is_neox_style, dtype, short_factor, long_factor, **extra_kwargs) + elif scaling_type == "mrope": + return MRotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, dtype, + mrope_section=rope_scaling["mrope_section"], + ) else: raise ValueError(f"Unknown RoPE scaling type {scaling_type}") _ROPE_DICT[key] = rotary_emb diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 8591c276b0013..28665e8f51db1 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -52,6 +52,7 @@ "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), @@ -85,6 +86,7 @@ "PaliGemmaForConditionalGeneration"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "UltravoxModel": ("ultravox", "UltravoxModel"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), } _CONDITIONAL_GENERATION_MODELS = { "BartModel": ("bart", "BartForConditionalGeneration"), diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py new file mode 100644 index 0000000000000..3bf3ad1c012df --- /dev/null +++ b/vllm/model_executor/models/qwen2_vl.py @@ -0,0 +1,715 @@ +# coding=utf-8 +# Adapted from +# TODO: link to transformers modeling file +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" +import math +from array import array +from collections.abc import Mapping +from functools import partial, lru_cache +from typing import Tuple, Optional, List, Iterable, Any, Dict, Type + +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from einops import rearrange, repeat +# from vllm_flash_attn.flash_attn_interface import flash_attn_varlen_func +from flash_attn import flash_attn_varlen_func +from transformers import Qwen2VLConfig +from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig +from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize + +from vllm.attention import AttentionMetadata +from vllm.config import MultiModalConfig, CacheConfig +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsMultiModal +from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalInputs +from vllm.multimodal.base import MultiModalData +from vllm.multimodal.image import cached_get_image_processor +from vllm.sequence import SequenceData, SamplerOutput, IntermediateTensors, VLLM_TOKEN_ID_ARRAY_TYPE +from vllm.utils import is_list_of + +logger = init_logger(__name__) + + +# === Vision Encoder === # + + +def quick_gelu(x: torch.Tensor, inplace: bool = False) -> torch.Tensor: + return x * torch.sigmoid(1.702 * x) + + +class QuickGELU(nn.Module): + """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)""" + + def __init__(self, inplace: bool = False) -> None: + super(QuickGELU, self).__init__() + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return quick_gelu(input) + + +class Qwen2VisionMLP(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: int = None, + act_layer: Type[nn.Module] = QuickGELU, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.fc1 = ColumnParallelLinear(in_features, hidden_features, quant_config=quant_config) + self.act = act_layer() + self.fc2 = RowParallelLinear(hidden_features, in_features, quant_config=quant_config) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_parallel, _ = self.fc1(x) + x_parallel = self.act(x_parallel) + x, _ = self.fc2(x_parallel) + return x + + +def rotate_half(x, interleaved=False): + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... 
(d two)", two=2) + + +def apply_rotary_emb_torch(x, cos, sin, interleaved=False): + """ + x: (batch_size, seqlen, nheads, headdim) + cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) + """ + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + return torch.cat( + [x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]], + dim=-1, + ) + + +def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + t_ = t.float() + cos = freqs.cos() + sin = freqs.sin() + output = apply_rotary_emb_torch(t_, cos, sin).type_as(t) + return output + + +class Qwen2VisionAttention(nn.Module): + def __init__( + self, + embed_dim: Optional[int] = None, + num_heads: Optional[int] = None, + projection_size: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = dist_utils.divide(projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide(num_heads, world_size) + + self.qkv = ColumnParallelLinear( + input_size=embed_dim, output_size=3 * projection_size, quant_config=quant_config) + self.proj = RowParallelLinear(input_size=projection_size, output_size=embed_dim, quant_config=quant_config) + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor = None, + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + x = x.view(*new_x_shape) + + # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] + q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + batch_size = q.shape[1] + + q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() for x in (q, k, v)] + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] + + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + output = flash_attn_varlen_func( + q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, 0, causal=False + ) + + context_layer = rearrange(output, '(b s) ... 
-> b s ...', b=batch_size) + context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Qwen2VisionBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float, + act_layer: Type[nn.Module] = QuickGELU, + norm_layer: Type[nn.Module] = partial(nn.LayerNorm, eps=1e-6), + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + + self.attn = Qwen2VisionAttention( + embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config + ) + self.mlp = Qwen2VisionMLP(dim, mlp_hidden_dim, act_layer=act_layer, quant_config=quant_config) + + def forward(self, x, cu_seqlens, rotary_pos_emb) -> torch.Tensor: + x = x + self.attn(self.norm1(x), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb) + x = x + self.mlp(self.norm2(x)) + return x + + +class Qwen2VisionPatchEmbed(nn.Module): + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_chans: int = 3, + embed_dim: int = 1152, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.embed_dim = embed_dim + + kernel_size = [temporal_patch_size, patch_size, patch_size] + self.proj = nn.Conv3d( + in_chans, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = self.proj(x).view(L, self.embed_dim) + return x + + +class Qwen2VisionPatchMerger(nn.Module): + def __init__( + self, + d_model: int, + context_dim: int, + norm_layer: Type[nn.Module] = partial(nn.LayerNorm, eps=1e-6), + spatial_merge_size: int = 2, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size ** 2) + self.ln_q = norm_layer(context_dim) + self.mlp = nn.ModuleList([ + ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=True, quant_config=quant_config), + nn.GELU(), + RowParallelLinear(self.hidden_size, d_model, bias=True, quant_config=quant_config), + ]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x) + x = x.view(-1, self.hidden_size) + + mlp_fc1, mlp_act, mlp_fc2 = self.mlp + x_parallel, _ = mlp_fc1(x) + x_parallel = mlp_act(x_parallel) + out, _ = mlp_fc2(x_parallel) + return out + + +class Qwen2VisionRotaryEmbedding(nn.Module): + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._freqs_cached = None + + def update_freqs_cache(self, seqlen: int) -> None: + if seqlen > self._seq_len_cached: + seqlen *= 2 + self._seq_len_cached = seqlen + self.inv_freq = 1.0 / ( + self.theta + ** ( + torch.arange(0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) + / self.dim + ) + ) + seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + self._freqs_cached = freqs + + def forward(self, seqlen: int) -> torch.Tensor: + self.update_freqs_cache(seqlen) + return self._freqs_cached[:seqlen] + + +class 
Qwen2VisionTransformer(nn.Module): + def __init__( + self, + vision_config: Qwen2VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + patch_size: int = vision_config.patch_size + temporal_patch_size: int = vision_config.temporal_patch_size + spatial_merge_size: int = vision_config.spatial_merge_size + in_chans: int = vision_config.in_chans + hidden_size: int = vision_config.hidden_size + embed_dim: int = vision_config.embed_dim + depth: int = vision_config.depth + num_heads: int = vision_config.num_heads + mlp_ratio: float = vision_config.mlp_ratio + + self.spatial_merge_size = spatial_merge_size + + self.patch_embed = Qwen2VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + head_dim = embed_dim // num_heads + self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList( + [ + Qwen2VisionBlock( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config, + ) + for _ in range(depth) + ] + ) + self.merger = Qwen2VisionPatchMerger( + d_model=hidden_size, + context_dim=embed_dim, + norm_layer=norm_layer, + quant_config=quant_config, + ) + + def get_dtype(self) -> torch.dtype: + return self.blocks[0].mlp.fc2.weight.dtype + + def get_device(self) -> torch.device: + return self.blocks[0].mlp.fc2.weight.device + + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, self.spatial_merge_size, + w // self.spatial_merge_size, self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, self.spatial_merge_size, + w // self.spatial_merge_size, self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # patchify + x = x.to(device=self.get_device(), dtype=self.get_dtype()) + x = self.patch_embed(x) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # compute cu_seqlens + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] + ).cumsum(dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), 'constant', 0) + + # transformers + x = x.unsqueeze(1) + for blk in self.blocks: + x = blk(x, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb) + + # adapter + x = self.merger(x) + return x + + +# === Vision input helpers === # + + +def get_processor( + processor_name: str, + *args, + trust_remote_code: bool = False, + **kwargs, +): + """Gets a processor for the given model name via HuggingFace. + + Derived from `vllm.transformers_utils.image_processor.get_image_processor`. 
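+
+    `cached_get_processor` below wraps this function in `functools.lru_cache`,
+    so repeated lookups for the same model reuse one processor instance.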
+ """ + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor + + try: + processor = AutoProcessor.from_pretrained( + processor_name, + *args, + trust_remote_code=trust_remote_code, + **kwargs) + except ValueError as e: + # If the error pertains to the processor class not existing or not + # currently being imported, suggest using the --trust-remote-code flag. + # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors + if not trust_remote_code: + err_msg = ( + "Failed to load the processor. If the processor is " + "a custom processor not yet available in the HuggingFace " + "transformers library, consider setting " + "`trust_remote_code=True` in LLM or using the " + "`--trust-remote-code` flag in the CLI.") + raise RuntimeError(err_msg) from e + else: + raise e + + return processor + + +cached_get_processor = lru_cache(get_processor) + +MAX_TEMPORAL_IMAGE_NUM = 10 + + +def mm_input_mapper_for_qwen2_vl( + ctx: InputContext, + data: MultiModalData[object], + data_type_key: str, +) -> MultiModalInputs: + """Input mapper for Qwen2-VL.""" + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, trust_remote_code=model_config.trust_remote_code) + if image_processor is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + + images = None + videos = None + if data_type_key == "image": + images = data + else: + assert data_type_key == "video" + videos = data + + try: + batch_data = image_processor \ + .preprocess(images=images, videos=videos, return_tensors="pt") \ + .data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + + return MultiModalInputs(batch_data) + + +image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, data_type_key="image") +video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, data_type_key="video") + + +def _get_max_image_info(image_processor, data_type_key: str = 'image', mm_count: int = 1): + max_resized_height, max_resized_width = smart_resize( + height=9999999, width=9999999, + factor=image_processor.patch_size * image_processor.merge_size, + + # Limit min / max pixels. 
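+        # The oversized height/width above makes smart_resize return the
+        # largest resolution permitted by max_pixels, which in turn bounds the
+        # worst-case number of image tokens computed below.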
+ min_pixels=max(image_processor.min_pixels, 28 * 28), + max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28), + ) + + if data_type_key == 'image': + max_grid_t = mm_count + else: + assert data_type_key == 'video' + max_grid_t = max(mm_count // image_processor.temporal_patch_size, 1) + + max_grid_h = max_resized_height // image_processor.patch_size + max_grid_w = max_resized_width // image_processor.patch_size + max_image_tokens = max_grid_t * max_grid_h * max_grid_w + max_llm_image_tokens = max_image_tokens // image_processor.merge_size // image_processor.merge_size + + return max_resized_height, max_resized_width, max_llm_image_tokens + + +def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: + image_processor = cached_get_image_processor(ctx.model_config.model) + max_resized_height, max_resized_width, max_llm_image_tokens = _get_max_image_info( + image_processor, data_type_key=data_type_key, mm_count=1) + return max_llm_image_tokens + + +get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens, data_type_key="image") +get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, data_type_key="video") + + +def dummy_data_for_qwen2_vl( + ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] +) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: + image_processor = cached_get_image_processor(ctx.model_config.model) + + num_images = mm_counts["image"] + max_resized_height, max_resized_width, max_llm_image_tokens = _get_max_image_info( + image_processor, data_type_key="image", mm_count=num_images) + if seq_len - max_llm_image_tokens - 2 < 0: + raise RuntimeError( + f'Qwen2-VL cannot process {num_images} images in a prompt, ' + f'please increase max_model_len or reduce image limit by --limit-mm-per-prompt.') + + # Check video counts. 
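+    # As with images above, the "- 2" reserves room for the vision_start and
+    # vision_end tokens that bracket the dummy multimodal tokens.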
+ num_videos = mm_counts["video"] + max_resized_height, max_resized_width, max_llm_video_tokens = _get_max_image_info( + image_processor, data_type_key="video", mm_count=num_videos) + if seq_len - max_llm_video_tokens - 2 < 0: + raise RuntimeError( + f'Qwen2-VL cannot process {num_images} videos in a prompt, ' + f'please increase max_model_len or reduce video limit by --limit-mm-per-prompt.') + + hf_config = ctx.get_hf_config(Qwen2VLConfig) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_start_token_id]) + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.image_token_id]) * max_llm_image_tokens + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_end_token_id]) + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * (seq_len - max_llm_image_tokens - 2) + dummy_seqdata = SequenceData(token_ids) + dummy_image = Image.new("RGB", (max_resized_width, max_resized_height), color=0) + + return dummy_seqdata, {"image": dummy_image if num_images == 1 else [dummy_image] * num_images} + + +def input_processor_for_qwen2_vl(ctx: InputContext, llm_inputs: LLMInputs) -> LLMInputs: + multi_modal_data = llm_inputs.get("multi_modal_data", None) + if multi_modal_data is None: + return llm_inputs + + image_inputs = multi_modal_data.get("image", None) + video_inputs = multi_modal_data.get("video", None) + + processor = cached_get_processor(ctx.model_config.model) + inputs = processor(text=[llm_inputs['prompt']], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt") + + return LLMInputs( + prompt_token_ids=inputs['input_ids'][0].tolist(), + prompt=llm_inputs['prompt'], + multi_modal_data=multi_modal_data, + ) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(image_input_mapper_for_qwen2_vl) +@MULTIMODAL_REGISTRY.register_input_mapper("video", video_input_mapper_for_qwen2_vl) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("video", get_max_qwen2_vl_video_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl) +class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal): + def __init__(self, + config: Qwen2VLConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + self.visual = Qwen2VisionTransformer( + config.vision_config, + norm_eps=getattr(config, 'rms_norm_eps', 1e-6), + quant_config=quant_config, + ) + + self.model = Qwen2Model(config, cache_config, quant_config) + + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + pixel_values: torch.Tensor = kwargs.get('pixel_values', None) + image_grid_thw: torch.Tensor = kwargs.get('image_grid_thw', None) + pixel_values_video: torch.Tensor = kwargs.get('pixel_values_video', None) + video_grid_thw: torch.Tensor = kwargs.get('video_grid_thw', None) + + no_vision = pixel_values is 
None and pixel_values_video is None + + if no_vision: + inputs_embeds = None + else: + inputs_embeds = self.model.embed_tokens(input_ids) + + if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope": + assert positions.ndim == 2 and positions.size(0) == 3, \ + f"multimodal section rotary embedding requires (3, seq_len) positions, but got {positions.size()}" + + if pixel_values is not None: + pixel_values = pixel_values.type(self.visual.get_dtype()) + image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + image_mask = (input_ids == self.config.image_token_id) + inputs_embeds[image_mask, :] = image_embeds + if pixel_values_video is not None: + pixel_values_video = pixel_values_video.type(self.visual.get_dtype()) + video_embeds = self.visual(pixel_values_video, grid_thw=video_grid_thw) + video_mask = (input_ids == self.config.video_token_id) + inputs_embeds[video_mask, :] = video_embeds + input_ids = None + + result = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + inputs_embeds=inputs_embeds, + ) + return result + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "up_proj", 1), + ("gate_up_proj", "gate_proj", 0), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "visual" in name and "qkv.weight" in name: + visual_num_heads = self.config.vision_config.num_heads + visual_embed_dim = self.config.vision_config.embed_dim + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, head_size, visual_embed_dim) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) + elif "visual" in name and "qkv.bias" in name: + visual_num_heads = self.config.vision_config.num_heads + visual_embed_dim = self.config.vision_config.embed_dim + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, head_size) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1) + param = params_dict[name] + + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index cd16cdcbd890c..745fc715caf45 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -9,6 +9,7 @@ from .base import (MultiModalDataDict, MultiModalInputMapper, 
MultiModalInputs, MultiModalPlugin, MultiModalTokensCalc, NestedTensors) from .image import ImagePlugin +from .video import VideoPlugin logger = init_logger(__name__) @@ -34,7 +35,7 @@ class MultiModalRegistry: :class:`~vllm.multimodal.MultiModalPlugin` for each modality. """ - DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin()) + DEFAULT_PLUGINS = (ImagePlugin(), AudioPlugin(), VideoPlugin()) def __init__( self, diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py new file mode 100644 index 0000000000000..aa316267817c3 --- /dev/null +++ b/vllm/multimodal/video.py @@ -0,0 +1,17 @@ +from vllm.inputs.registry import InputContext +from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin + + +class VideoPlugin(MultiModalPlugin): + """Plugin for video data.""" + + def get_data_key(self) -> str: + return "video" + + def _default_input_mapper(self, ctx: InputContext, + data: object) -> MultiModalInputs: + raise NotImplementedError("There is no default video input mapper") + + def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: + raise NotImplementedError( + "There is no default maximum multimodal tokens") diff --git a/vllm/sequence.py b/vllm/sequence.py index 2fe8ae9d7b270..87db24791a476 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -165,6 +165,9 @@ class SequenceData(msgspec.Struct, # is called. _new_appended_tokens: List[int] = msgspec.field(default_factory=list) + # It is used to compute mrope_position_ids. + _mrope_position_delta: Optional[int] = None + def __post_init__(self) -> None: assert self._prompt_token_ids.typecode == "l" assert self._output_token_ids.typecode == "l" @@ -219,6 +222,14 @@ def output_token_ids_array(self) -> array: assert isinstance(self._output_token_ids, array) return self._output_token_ids + @property + def mrope_position_delta(self) -> Optional[int]: + return self._mrope_position_delta + + @mrope_position_delta.setter + def mrope_position_delta(self, new_mrope_position_delta): + self._mrope_position_delta = new_mrope_position_delta + def append_token_id(self, token_id: int, logprob: float) -> None: self._output_token_ids.append(token_id) self._new_appended_tokens.append(token_id) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5d930919b8ae5..cc2cd6519e863 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -197,6 +197,7 @@ def __init__( # Input tokens and positions. input_tokens: Optional[List[List[int]]] = None, input_positions: Optional[List[List[int]]] = None, + mrope_input_positions: Optional[List[List[List[int]]]] = None, # The sequence length (may be capped to the sliding window). 
seq_lens: Optional[List[int]] = None, @@ -257,6 +258,8 @@ def __init__( for seq_id in range(len(self.seq_ids)): self.input_positions[seq_id].clear() + self.mrope_input_positions = None + if seq_lens: self.seq_lens = seq_lens else: @@ -318,6 +321,7 @@ def __init__( else: self.input_tokens = input_tokens or [] self.input_positions = input_positions or [] + self.mrope_input_positions = mrope_input_positions or None self.seq_lens = seq_lens or [] self.orig_seq_lens = orig_seq_lens or [] self.query_lens = query_lens or [] @@ -348,6 +352,7 @@ def __post_init__(self): self.input_tokens = [[] for _ in range(self.n_seqs)] self.input_positions = [[] for _ in range(self.n_seqs)] + self.mrope_input_positions = None self.seq_lens = [0] * self.n_seqs self.orig_seq_lens = [0] * self.n_seqs self.query_lens = [0] * self.n_seqs @@ -484,6 +489,16 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, inter_data.query_lens[ seq_idx] = seq_len - context_len if inter_data.is_prompt else 1 + if seq_data.mrope_position_delta is not None: + if inter_data.mrope_input_positions is None: + inter_data.mrope_input_positions = [None] * inter_data.n_seqs + + from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding + + inter_data.mrope_input_positions[seq_idx] = MRotaryEmbedding.get_next_input_positions( + seq_data.mrope_position_delta, context_len, seq_len, + ) + def _compute_for_prefix_cache_hit( self, inter_data: InterDataForSeqGroup, seq_idx: int, seq_group_metadata: SequenceGroupMetadata): @@ -600,8 +615,40 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, return mm_kwargs = self.multi_modal_input_mapper(mm_data) + inter_data.multi_modal_inputs = mm_kwargs + # special processing for mrope position deltas. + if self.runner.model_is_mrope: + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + assert image_grid_thw is not None or video_grid_thw is not None, \ + "mrope embedding type requires multi-modal input mapper returns 'image_grid_thw' or 'video_grid_thw'." + + hf_config = self.runner.model_config.hf_config + + from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding + + inter_data.mrope_input_positions = [None] * inter_data.n_seqs + for seq_idx in range(inter_data.n_seqs): + seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] + token_ids = seq_data.get_token_ids() + + mrope_input_positions, mrope_position_delta = MRotaryEmbedding.get_input_positions( + token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config.spatial_merge_size, + context_len=inter_data.context_lens[seq_idx], + ) + + seq_data.mrope_position_delta = mrope_position_delta + inter_data.mrope_input_positions[seq_idx] = mrope_input_positions + def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): """Add a sequence group to the builder.""" seq_ids = seq_group_metadata.seq_data.keys() @@ -650,10 +697,23 @@ def build(self) -> ModelInputForGPU: # prefix caching and there is no decode request. 
return self.model_input_cls() - input_positions = [] - for inter_data in self.inter_data_list: - for cur_input_positions in inter_data.input_positions: - input_positions.extend(cur_input_positions) + mrope_input_positions = None + if any(inter_data.mrope_input_positions is not None for inter_data in self.inter_data_list): + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + for inter_data in self.inter_data_list: + if inter_data.mrope_input_positions is None: + for _seq_input_positions in inter_data.input_positions: + mrope_input_positions[idx].extend(_seq_input_positions) + else: + for _seq_mrope_input_positions in inter_data.mrope_input_positions: + mrope_input_positions[idx].extend(_seq_mrope_input_positions[idx]) + input_positions = None + else: + input_positions = [] + for inter_data in self.inter_data_list: + for cur_input_positions in inter_data.input_positions: + input_positions.extend(cur_input_positions) seq_lens = [] max_decode_seq_len = 0 @@ -690,15 +750,21 @@ def build(self) -> ModelInputForGPU: # Tokens and positions. if cuda_graph_pad_size: input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) - input_positions.extend(itertools.repeat(0, cuda_graph_pad_size)) assert self.runner.device is not None input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, self.runner.device, self.runner.pin_memory) - input_positions_tensor = async_tensor_h2d(input_positions, torch.long, - self.runner.device, - self.runner.pin_memory) - + if mrope_input_positions is not None: + for idx in range(3): + mrope_input_positions[idx].extend(itertools.repeat(0, cuda_graph_pad_size)) + input_positions_tensor = async_tensor_h2d(mrope_input_positions, torch.long, + self.runner.device, + self.runner.pin_memory) + else: + input_positions.extend(itertools.repeat(0, cuda_graph_pad_size)) + input_positions_tensor = async_tensor_h2d(input_positions, torch.long, + self.runner.device, + self.runner.pin_memory) # Sequence and query lengths. if cuda_graph_pad_size: seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) @@ -1163,6 +1229,13 @@ def list_prompt_adapters(self) -> Set[int]: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.list_adapters() + @property + def model_is_mrope(self) -> bool: + """Detect if the model has "mrope" rope_scaling type. + mrope requires keep "rope_deltas" between prompt and decoding phases.""" + rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + return rope_scaling.get("type", None) == "mrope" + @torch.inference_mode() def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: """Cuda graph capture a model. @@ -1193,7 +1266,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() - + if self.model_is_mrope: + input_positions = torch.tile(input_positions, (3, 1)) # Prepare dummy previous_hidden_states only if needed by the model. # This is used by draft models such as EAGLE. 
previous_hidden_states = None @@ -1258,7 +1332,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: "input_ids": input_tokens[:batch_size], "positions": - input_positions[:batch_size], + input_positions[..., :batch_size], "hidden_or_intermediate_states": hidden_or_intermediate_states[ virtual_engine] # type: ignore From 7f96df80e099e2f6d7977bc1a392374eadcd25cb Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Tue, 27 Aug 2024 12:30:02 +0800 Subject: [PATCH 02/34] Reformat --- docs/source/models/supported_models.rst | 5 + .../model_executor/layers/rotary_embedding.py | 61 ++-- vllm/model_executor/models/__init__.py | 6 +- vllm/model_executor/models/qwen2_vl.py | 288 ++++++++++++------ vllm/worker/model_runner.py | 36 ++- 5 files changed, 265 insertions(+), 131 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 223c68b40766e..f77388850304a 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -219,6 +219,11 @@ Multimodal Language Models - Image - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL + - Image / Video + - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. + - * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 1b454e3cebf3c..b4bd199474804 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -776,9 +776,10 @@ def __init__( base: int, is_neox_style: bool, dtype: torch.dtype, - mrope_section: List[int] = None, + mrope_section: Optional[List[int]] = None, ) -> None: - super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) self.mrope_section = mrope_section if self.mrope_section: @@ -806,8 +807,16 @@ def forward( cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) if self.mrope_section and positions.ndim == query.ndim - 1: - cos = torch.cat([m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))], dim=-1) - sin = torch.cat([m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))], dim=-1) + cos = torch.cat([ + m[i] + for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) + ], + dim=-1) + sin = torch.cat([ + m[i] + for i, m in enumerate(sin.split(self.mrope_section, dim=-1)) + ], + dim=-1) if self.is_neox_style: # NOTE(woosuk): Here we assume that the positions tensor has the @@ -857,7 +866,8 @@ def get_input_positions( video_grid_thw = video_grid_thw.tolist() input_tokens_tensor = torch.tensor(input_tokens) - vision_start_indices = torch.argwhere(input_tokens_tensor == vision_start_token_id).squeeze(1) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id).squeeze(1) vision_tokens = input_tokens_tensor[vision_start_indices + 1] image_nums = (vision_tokens == image_token_id).sum() video_nums = (vision_tokens == video_token_id).sum() @@ -897,23 +907,32 @@ def get_input_positions( llm_grid_t, llm_grid_h, llm_grid_w = t, h // spatial_merge_size, w // spatial_merge_size text_len = ed - st - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - 
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - - t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() - h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() - w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() - llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) st = ed + llm_grid_t * llm_grid_h * llm_grid_w if st < len(input_tokens): - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 text_len = len(input_tokens) - st - llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) llm_positions = llm_positions[:, context_len:] - mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() return llm_positions.tolist(), mrope_position_delta @@ -924,8 +943,9 @@ def get_next_input_positions( seq_len: int, ) -> List[List[int]]: return [ - list(range(context_len + mrope_position_delta, seq_len + mrope_position_delta)) - for _ in range(3) + list( + range(context_len + mrope_position_delta, + seq_len + mrope_position_delta)) for _ in range(3) ] @@ -1035,7 +1055,12 @@ def get_rope( **extra_kwargs) elif scaling_type == "mrope": return MRotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, dtype, + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, mrope_section=rope_scaling["mrope_section"], ) else: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 28665e8f51db1..5115c0708b18a 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -52,7 +52,8 @@ "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), - "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), + "Qwen2VLForConditionalGeneration": + ("qwen2_vl", "Qwen2VLForConditionalGeneration"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), @@ -86,7 +87,8 @@ "PaliGemmaForConditionalGeneration"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "UltravoxModel": ("ultravox", "UltravoxModel"), - "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", + "Qwen2VLForConditionalGeneration"), } _CONDITIONAL_GENERATION_MODELS = { "BartModel": ("bart", 
"BartForConditionalGeneration"), diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 3bf3ad1c012df..8459ea4cc0891 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,11 +22,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -import math + from array import array from collections.abc import Mapping from functools import partial, lru_cache -from typing import Tuple, Optional, List, Iterable, Any, Dict, Type +from typing import Tuple, Optional, List, Iterable, Type import torch import torch.nn as nn @@ -46,9 +46,8 @@ from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler @@ -60,22 +59,20 @@ from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import SequenceData, SamplerOutput, IntermediateTensors, VLLM_TOKEN_ID_ARRAY_TYPE -from vllm.utils import is_list_of logger = init_logger(__name__) - # === Vision Encoder === # -def quick_gelu(x: torch.Tensor, inplace: bool = False) -> torch.Tensor: +def quick_gelu(x: torch.Tensor) -> torch.Tensor: return x * torch.sigmoid(1.702 * x) class QuickGELU(nn.Module): - """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)""" + """Applies the Gaussian Error Linear Units function.""" - def __init__(self, inplace: bool = False) -> None: + def __init__(self) -> None: super(QuickGELU, self).__init__() def forward(self, input: torch.Tensor) -> torch.Tensor: @@ -83,6 +80,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: class Qwen2VisionMLP(nn.Module): + def __init__( self, in_features: int, @@ -91,9 +89,13 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, ): super().__init__() - self.fc1 = ColumnParallelLinear(in_features, hidden_features, quant_config=quant_config) + self.fc1 = ColumnParallelLinear(in_features, + hidden_features, + quant_config=quant_config) self.act = act_layer() - self.fc2 = RowParallelLinear(hidden_features, in_features, quant_config=quant_config) + self.fc2 = RowParallelLinear(hidden_features, + in_features, + quant_config=quant_config) def forward(self, x: torch.Tensor) -> torch.Tensor: x_parallel, _ = self.fc1(x) @@ -108,7 +110,9 @@ def rotate_half(x, interleaved=False): return torch.cat((-x2, x1), dim=-1) else: x1, x2 = x[..., ::2], x[..., 1::2] - return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2) + return rearrange(torch.stack((-x2, x1), dim=-1), + "... d two -> ... (d two)", + two=2) def apply_rotary_emb_torch(x, cos, sin, interleaved=False): @@ -118,15 +122,23 @@ def apply_rotary_emb_torch(x, cos, sin, interleaved=False): """ ro_dim = cos.shape[-1] * 2 assert ro_dim <= x.shape[-1] - cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") - sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + cos = repeat( + cos, + "... 
d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + sin = repeat( + sin, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") return torch.cat( - [x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]], + [ + x[..., :ro_dim] * cos + + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:] + ], dim=-1, ) -def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: +def apply_rotary_pos_emb_vision(t: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: t_ = t.float() cos = freqs.cos() sin = freqs.sin() @@ -135,6 +147,7 @@ def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.T class Qwen2VisionAttention(nn.Module): + def __init__( self, embed_dim: Optional[int] = None, @@ -145,12 +158,17 @@ def __init__( super().__init__() # Per attention head and per partition values. world_size = parallel_state.get_tensor_model_parallel_world_size() - self.hidden_size_per_attention_head = dist_utils.divide(projection_size, num_heads) - self.num_attention_heads_per_partition = dist_utils.divide(num_heads, world_size) - - self.qkv = ColumnParallelLinear( - input_size=embed_dim, output_size=3 * projection_size, quant_config=quant_config) - self.proj = RowParallelLinear(input_size=projection_size, output_size=embed_dim, quant_config=quant_config) + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, world_size) + + self.qkv = ColumnParallelLinear(input_size=embed_dim, + output_size=3 * projection_size, + quant_config=quant_config) + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config) def forward( self, @@ -172,25 +190,35 @@ def forward( q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) batch_size = q.shape[1] - q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() for x in (q, k, v)] + q, k, v = [ + rearrange(x, 's b ... -> b s ...').contiguous() for x in (q, k, v) + ] if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - output = flash_attn_varlen_func( - q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, 0, causal=False - ) + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens, + cu_seqlens, + max_seqlen, + max_seqlen, + 0, + causal=False) context_layer = rearrange(output, '(b s) ... 
-> b s ...', b=batch_size) - context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + context_layer = rearrange(context_layer, + 'b s h d -> s b (h d)').contiguous() output, _ = self.proj(context_layer) return output class Qwen2VisionBlock(nn.Module): + def __init__( self, dim: int, @@ -205,21 +233,25 @@ def __init__( self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) - self.attn = Qwen2VisionAttention( - embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config - ) - self.mlp = Qwen2VisionMLP(dim, mlp_hidden_dim, act_layer=act_layer, quant_config=quant_config) + self.attn = Qwen2VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config) + self.mlp = Qwen2VisionMLP(dim, + mlp_hidden_dim, + act_layer=act_layer, + quant_config=quant_config) def forward(self, x, cu_seqlens, rotary_pos_emb) -> torch.Tensor: - x = x + self.attn(self.norm1(x), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb) + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb) x = x + self.mlp(self.norm2(x)) return x class Qwen2VisionPatchEmbed(nn.Module): + def __init__( self, patch_size: int = 14, @@ -233,18 +265,22 @@ def __init__( self.embed_dim = embed_dim kernel_size = [temporal_patch_size, patch_size, patch_size] - self.proj = nn.Conv3d( - in_chans, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False - ) + self.proj = nn.Conv3d(in_chans, + embed_dim, + kernel_size=kernel_size, + stride=kernel_size, + bias=False) def forward(self, x: torch.Tensor) -> torch.Tensor: L, C = x.shape - x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size) + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, + self.patch_size) x = self.proj(x).view(L, self.embed_dim) return x class Qwen2VisionPatchMerger(nn.Module): + def __init__( self, d_model: int, @@ -254,12 +290,18 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() - self.hidden_size = context_dim * (spatial_merge_size ** 2) + self.hidden_size = context_dim * (spatial_merge_size**2) self.ln_q = norm_layer(context_dim) self.mlp = nn.ModuleList([ - ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=True, quant_config=quant_config), + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config), nn.GELU(), - RowParallelLinear(self.hidden_size, d_model, bias=True, quant_config=quant_config), + RowParallelLinear(self.hidden_size, + d_model, + bias=True, + quant_config=quant_config), ]) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -274,11 +316,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Qwen2VisionRotaryEmbedding(nn.Module): + def __init__(self, dim: int, theta: float = 10000.0) -> None: super().__init__() self.dim = dim self.theta = theta - inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) self._seq_len_cached = 0 self._freqs_cached = None @@ -287,14 +331,12 @@ def update_freqs_cache(self, seqlen: int) -> None: if seqlen > self._seq_len_cached: seqlen *= 2 self._seq_len_cached = seqlen - self.inv_freq = 1.0 / ( - self.theta - ** ( - torch.arange(0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) - / self.dim - ) - ) - seq = torch.arange(seqlen, device=self.inv_freq.device, 
dtype=self.inv_freq.dtype) + self.inv_freq = 1.0 / (self.theta**(torch.arange( + 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) + / self.dim)) + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) freqs = torch.outer(seq, self.inv_freq) self._freqs_cached = freqs @@ -304,6 +346,7 @@ def forward(self, seqlen: int) -> torch.Tensor: class Qwen2VisionTransformer(nn.Module): + def __init__( self, vision_config: Qwen2VLVisionConfig, @@ -335,18 +378,15 @@ def __init__( head_dim = embed_dim // num_heads self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2) - self.blocks = nn.ModuleList( - [ - Qwen2VisionBlock( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - norm_layer=norm_layer, - quant_config=quant_config, - ) - for _ in range(depth) - ] - ) + self.blocks = nn.ModuleList([ + Qwen2VisionBlock( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config, + ) for _ in range(depth) + ]) self.merger = Qwen2VisionPatchMerger( d_model=hidden_size, context_dim=embed_dim, @@ -366,14 +406,19 @@ def rot_pos_emb(self, grid_thw): hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, self.spatial_merge_size, - w // self.spatial_merge_size, self.spatial_merge_size, + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, ).permute(0, 2, 1, 3).flatten() wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, self.spatial_merge_size, - w // self.spatial_merge_size, self.spatial_merge_size, + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, ).permute(0, 2, 1, 3).flatten() - pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) max_grid_size = grid_thw[:, 1:].max() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) @@ -393,9 +438,9 @@ def forward( rotary_pos_emb = self.rot_pos_emb(grid_thw) # compute cu_seqlens - cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] - ).cumsum(dim=0, dtype=torch.int32) + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) cu_seqlens = F.pad(cu_seqlens, (1, 0), 'constant', 0) # transformers @@ -486,13 +531,18 @@ def mm_input_mapper_for_qwen2_vl( return MultiModalInputs(batch_data) -image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, data_type_key="image") -video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, data_type_key="video") +image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, + data_type_key="image") +video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, + data_type_key="video") -def _get_max_image_info(image_processor, data_type_key: str = 'image', mm_count: int = 1): +def _get_max_image_info(image_processor, + data_type_key: str = 'image', + mm_count: int = 1): max_resized_height, max_resized_width = smart_resize( - height=9999999, width=9999999, + height=9999999, + width=9999999, factor=image_processor.patch_size * image_processor.merge_size, # Limit min / max pixels. 
@@ -521,8 +571,10 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: return max_llm_image_tokens -get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens, data_type_key="image") -get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, data_type_key="video") +get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens, + data_type_key="image") +get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, + data_type_key="video") def dummy_data_for_qwen2_vl( @@ -536,7 +588,8 @@ def dummy_data_for_qwen2_vl( if seq_len - max_llm_image_tokens - 2 < 0: raise RuntimeError( f'Qwen2-VL cannot process {num_images} images in a prompt, ' - f'please increase max_model_len or reduce image limit by --limit-mm-per-prompt.') + f'please increase max_model_len or reduce image limit by --limit-mm-per-prompt.' + ) # Check video counts. num_videos = mm_counts["video"] @@ -545,20 +598,29 @@ def dummy_data_for_qwen2_vl( if seq_len - max_llm_video_tokens - 2 < 0: raise RuntimeError( f'Qwen2-VL cannot process {num_images} videos in a prompt, ' - f'please increase max_model_len or reduce video limit by --limit-mm-per-prompt.') + f'please increase max_model_len or reduce video limit by --limit-mm-per-prompt.' + ) hf_config = ctx.get_hf_config(Qwen2VLConfig) - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_start_token_id]) - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.image_token_id]) * max_llm_image_tokens - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.vision_end_token_id]) - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * (seq_len - max_llm_image_tokens - 2) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + [hf_config.vision_start_token_id]) + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [hf_config.image_token_id]) * max_llm_image_tokens + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [hf_config.vision_end_token_id]) + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0]) * (seq_len - max_llm_image_tokens - 2) dummy_seqdata = SequenceData(token_ids) - dummy_image = Image.new("RGB", (max_resized_width, max_resized_height), color=0) + dummy_image = Image.new("RGB", (max_resized_width, max_resized_height), + color=0) - return dummy_seqdata, {"image": dummy_image if num_images == 1 else [dummy_image] * num_images} + return dummy_seqdata, { + "image": dummy_image if num_images == 1 else [dummy_image] * num_images + } -def input_processor_for_qwen2_vl(ctx: InputContext, llm_inputs: LLMInputs) -> LLMInputs: +def input_processor_for_qwen2_vl(ctx: InputContext, + llm_inputs: LLMInputs) -> LLMInputs: multi_modal_data = llm_inputs.get("multi_modal_data", None) if multi_modal_data is None: return llm_inputs @@ -567,7 +629,11 @@ def input_processor_for_qwen2_vl(ctx: InputContext, llm_inputs: LLMInputs) -> LL video_inputs = multi_modal_data.get("video", None) processor = cached_get_processor(ctx.model_config.model) - inputs = processor(text=[llm_inputs['prompt']], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt") + inputs = processor(text=[llm_inputs['prompt']], + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt") return LLMInputs( prompt_token_ids=inputs['input_ids'][0].tolist(), @@ -576,13 +642,17 @@ def input_processor_for_qwen2_vl(ctx: InputContext, llm_inputs: LLMInputs) -> LL ) -@MULTIMODAL_REGISTRY.register_image_input_mapper(image_input_mapper_for_qwen2_vl) -@MULTIMODAL_REGISTRY.register_input_mapper("video", video_input_mapper_for_qwen2_vl) 
+@MULTIMODAL_REGISTRY.register_image_input_mapper( + image_input_mapper_for_qwen2_vl) +@MULTIMODAL_REGISTRY.register_input_mapper("video", + video_input_mapper_for_qwen2_vl) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("video", get_max_qwen2_vl_video_tokens) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( + "video", get_max_qwen2_vl_video_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl) @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal): + def __init__(self, config: Qwen2VLConfig, multimodal_config: MultiModalConfig, @@ -620,9 +690,21 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, **kwargs: object, ) -> SamplerOutput: + """Run forward pass for Qwen2-VL. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a batch. + positions: Flattened (concatenated) position ids corresponding to a batch. + NOTE: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). + pixel_values: Pixel values to be fed to a model. `None` if no images are passed. + image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. `None` if no images are passed. + pixel_values_video: Pixel values of videos to be fed to a model. `None` if no videos are passed. + video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. `None` if no videos are passed. + """ pixel_values: torch.Tensor = kwargs.get('pixel_values', None) image_grid_thw: torch.Tensor = kwargs.get('image_grid_thw', None) - pixel_values_video: torch.Tensor = kwargs.get('pixel_values_video', None) + pixel_values_video: torch.Tensor = kwargs.get('pixel_values_video', + None) video_grid_thw: torch.Tensor = kwargs.get('video_grid_thw', None) no_vision = pixel_values is None and pixel_values_video is None @@ -632,18 +714,22 @@ def forward( else: inputs_embeds = self.model.embed_tokens(input_ids) - if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope": + if getattr(self.config, "rope_scaling", {}).get("type", + None) == "mrope": assert positions.ndim == 2 and positions.size(0) == 3, \ f"multimodal section rotary embedding requires (3, seq_len) positions, but got {positions.size()}" if pixel_values is not None: pixel_values = pixel_values.type(self.visual.get_dtype()) - image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + image_embeds = self.visual(pixel_values, + grid_thw=image_grid_thw) image_mask = (input_ids == self.config.image_token_id) inputs_embeds[image_mask, :] = image_embeds if pixel_values_video is not None: - pixel_values_video = pixel_values_video.type(self.visual.get_dtype()) - video_embeds = self.visual(pixel_values_video, grid_thw=video_grid_thw) + pixel_values_video = pixel_values_video.type( + self.visual.get_dtype()) + video_embeds = self.visual(pixel_values_video, + grid_thw=video_grid_thw) video_mask = (input_ids == self.config.video_token_id) inputs_embeds[video_mask, :] = video_embeds input_ids = None @@ -699,17 +785,21 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): visual_num_heads = self.config.vision_config.num_heads visual_embed_dim = self.config.vision_config.embed_dim head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, head_size, visual_embed_dim) + loaded_weight = loaded_weight.view(3, 
visual_num_heads, + head_size, + visual_embed_dim) loaded_weight = loaded_weight.transpose(0, 1) loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) elif "visual" in name and "qkv.bias" in name: visual_num_heads = self.config.vision_config.num_heads visual_embed_dim = self.config.vision_config.embed_dim head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, head_size) + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size) loaded_weight = loaded_weight.transpose(0, 1) loaded_weight = loaded_weight.reshape(-1) param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index cc2cd6519e863..5464280fc027a 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -495,9 +495,12 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding - inter_data.mrope_input_positions[seq_idx] = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, context_len, seq_len, - ) + inter_data.mrope_input_positions[ + seq_idx] = MRotaryEmbedding.get_next_input_positions( + seq_data.mrope_position_delta, + context_len, + seq_len, + ) def _compute_for_prefix_cache_hit( self, inter_data: InterDataForSeqGroup, seq_idx: int, @@ -631,7 +634,8 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.mrope_input_positions = [None] * inter_data.n_seqs for seq_idx in range(inter_data.n_seqs): - seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] + seq_data = seq_group_metadata.seq_data[ + inter_data.seq_ids[seq_idx]] token_ids = seq_data.get_token_ids() mrope_input_positions, mrope_position_delta = MRotaryEmbedding.get_input_positions( @@ -642,12 +646,14 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, video_token_id=hf_config.video_token_id, vision_start_token_id=hf_config.vision_start_token_id, vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config.spatial_merge_size, + spatial_merge_size=hf_config.vision_config. 
+ spatial_merge_size, context_len=inter_data.context_lens[seq_idx], ) seq_data.mrope_position_delta = mrope_position_delta - inter_data.mrope_input_positions[seq_idx] = mrope_input_positions + inter_data.mrope_input_positions[ + seq_idx] = mrope_input_positions def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): """Add a sequence group to the builder.""" @@ -698,16 +704,19 @@ def build(self) -> ModelInputForGPU: return self.model_input_cls() mrope_input_positions = None - if any(inter_data.mrope_input_positions is not None for inter_data in self.inter_data_list): + if any(inter_data.mrope_input_positions is not None + for inter_data in self.inter_data_list): mrope_input_positions = [[] for _ in range(3)] for idx in range(3): for inter_data in self.inter_data_list: if inter_data.mrope_input_positions is None: for _seq_input_positions in inter_data.input_positions: - mrope_input_positions[idx].extend(_seq_input_positions) + mrope_input_positions[idx].extend( + _seq_input_positions) else: for _seq_mrope_input_positions in inter_data.mrope_input_positions: - mrope_input_positions[idx].extend(_seq_mrope_input_positions[idx]) + mrope_input_positions[idx].extend( + _seq_mrope_input_positions[idx]) input_positions = None else: input_positions = [] @@ -756,13 +765,16 @@ def build(self) -> ModelInputForGPU: self.runner.pin_memory) if mrope_input_positions is not None: for idx in range(3): - mrope_input_positions[idx].extend(itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(mrope_input_positions, torch.long, + mrope_input_positions[idx].extend( + itertools.repeat(0, cuda_graph_pad_size)) + input_positions_tensor = async_tensor_h2d(mrope_input_positions, + torch.long, self.runner.device, self.runner.pin_memory) else: input_positions.extend(itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(input_positions, torch.long, + input_positions_tensor = async_tensor_h2d(input_positions, + torch.long, self.runner.device, self.runner.pin_memory) # Sequence and query lengths. From bcaff4fa0810fef8f9ed46e77543fec7ea3fcdfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 27 Aug 2024 17:30:08 +0800 Subject: [PATCH 03/34] Update transformers link. --- vllm/model_executor/models/qwen2_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8459ea4cc0891..5286eb6fca9af 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,6 +1,6 @@ # coding=utf-8 # Adapted from -# TODO: link to transformers modeling file +# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. From f2185bf918b38ba817ead4047709498f05315dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 27 Aug 2024 18:55:47 +0800 Subject: [PATCH 04/34] Bugfix of mrope_input_positions in model_runner.py. 
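
`simple_reinit()` recycles an existing `InterDataForSeqGroup` instead of rebuilding it, so every cached field has to be cleared explicitly; without this reset the mrope positions computed for the previous sequence group would survive into the reused object. A minimal sketch of the intended reset pattern (the class below is heavily simplified and only illustrates the fix, it is not the real builder class):

    # Simplified sketch: a recycled per-sequence-group buffer must clear
    # every cached field, including the optional mrope positions.
    from typing import List, Optional

    class InterData:
        def __init__(self) -> None:
            self.input_tokens: List[List[int]] = [[]]
            self.input_positions: List[List[int]] = [[]]
            self.mrope_input_positions: Optional[List[List[int]]] = None

        def simple_reinit(self) -> None:
            self.input_tokens[0].clear()
            self.input_positions[0].clear()
            self.mrope_input_positions = None  # the fix: drop stale state
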
--- vllm/worker/model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b245c9a24dfa0..d846a32265231 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -173,6 +173,7 @@ class InterDataForSeqGroup: def simple_reinit(self): self.input_tokens[0].clear() # type: ignore self.input_positions[0].clear() # type: ignore + self.mrope_input_positions = None # type: ignore self.seq_lens[0] = 0 # type: ignore self.orig_seq_lens[0] = 0 # type: ignore self.query_lens[0] = 0 # type: ignore From 60448cb03bd1bc6d763d6fb8ec976af2912da454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 27 Aug 2024 20:27:59 +0800 Subject: [PATCH 05/34] Rename pixel_values_video to pixel_values_videos in qwen2_vl.py. Add Qwen2-VL support in chat_utils.py. --- vllm/entrypoints/chat_utils.py | 6 ++++++ vllm/model_executor/models/qwen2_vl.py | 12 ++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 19d1095084293..34cfc22cb96ea 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -131,12 +131,18 @@ def _mm_token_str(model_config: ModelConfig, tokenizer: AnyTokenizer, return tokenizer.decode(model_config.hf_config.image_token_index) if model_type in ("chameleon", "internvl_chat"): return "" + if model_type == "qwen2_vl": + return "<|vision_start|><|image_pad|><|vision_end|>" raise TypeError(f"Unknown model type: {model_type}") elif modality == "audio": if model_type == "ultravox": return "<|reserved_special_token_0|>" raise TypeError(f"Unknown model type: {model_type}") + elif modality == "video": + if model_type == "qwen2_vl": + return "<|vision_start|><|video_pad|><|vision_end|>" + raise TypeError(f"Unknown model type: {model_type}") else: raise TypeError(f"Unknown modality: {modality}") diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 5286eb6fca9af..e762d07bdb107 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -698,16 +698,16 @@ def forward( NOTE: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). pixel_values: Pixel values to be fed to a model. `None` if no images are passed. image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. `None` if no images are passed. - pixel_values_video: Pixel values of videos to be fed to a model. `None` if no videos are passed. + pixel_values_videos: Pixel values of videos to be fed to a model. `None` if no videos are passed. video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. `None` if no videos are passed. 
""" pixel_values: torch.Tensor = kwargs.get('pixel_values', None) image_grid_thw: torch.Tensor = kwargs.get('image_grid_thw', None) - pixel_values_video: torch.Tensor = kwargs.get('pixel_values_video', + pixel_values_videos: torch.Tensor = kwargs.get('pixel_values_videos', None) video_grid_thw: torch.Tensor = kwargs.get('video_grid_thw', None) - no_vision = pixel_values is None and pixel_values_video is None + no_vision = pixel_values is None and pixel_values_videos is None if no_vision: inputs_embeds = None @@ -725,10 +725,10 @@ def forward( grid_thw=image_grid_thw) image_mask = (input_ids == self.config.image_token_id) inputs_embeds[image_mask, :] = image_embeds - if pixel_values_video is not None: - pixel_values_video = pixel_values_video.type( + if pixel_values_videos is not None: + pixel_values_videos = pixel_values_videos.type( self.visual.get_dtype()) - video_embeds = self.visual(pixel_values_video, + video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw) video_mask = (input_ids == self.config.video_token_id) inputs_embeds[video_mask, :] = video_embeds From 71a77b1b996cc10a7a9cf27310c6722dd9c84ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 27 Aug 2024 20:56:45 +0800 Subject: [PATCH 06/34] Fix the bug of MultiModalInputs.batch() when passing different modalities in a single batch. --- vllm/model_executor/models/qwen2_vl.py | 4 ++++ vllm/multimodal/base.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index e762d07bdb107..394256825f423 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -528,6 +528,10 @@ def mm_input_mapper_for_qwen2_vl( logger.error("Failed to process image (%s)", data) raise + # Ensure different modalities will return a batch_data with same keys, avoid error in `MultiModalInputs.batch()`. + for key in ['pixel_values', 'image_grid_thw', 'pixel_values_videos', 'video_grid_thw']: + batch_data.setdefault(key, None) + return MultiModalInputs(batch_data) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 8ada60c8fd6ae..8ef30b705db4a 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -99,7 +99,11 @@ def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: raise ValueError(msg) for k, v in inputs.items(): - item_lists[k].append(v) + # For models that supports multiple modalities (e.g. Qwen2-VL), + # input mapper will set values of unused modality keys to None, + # and batching procedure should skip them. + if v is not None: + item_lists[k].append(v) return { k: MultiModalInputs._try_concat(item_list) From 60c4cbd26820a5913e062095830ac890e8be4c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 27 Aug 2024 21:29:07 +0800 Subject: [PATCH 07/34] Fix the bug when running OpenAI-compatible API server. 
--- vllm/model_executor/models/qwen2_vl.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 394256825f423..37a02859d8520 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -633,7 +633,13 @@ def input_processor_for_qwen2_vl(ctx: InputContext, video_inputs = multi_modal_data.get("video", None) processor = cached_get_processor(ctx.model_config.model) - inputs = processor(text=[llm_inputs['prompt']], + + prompt = llm_inputs['prompt'] + if prompt is None: + prompt_token_ids = llm_inputs['prompt_token_ids'] + prompt = processor.tokenizer.decode(prompt_token_ids) + + inputs = processor(text=[prompt], images=image_inputs, videos=video_inputs, padding=True, From ddb71389e84478f6f23ca1f7c31452f8cb2a8353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Thu, 29 Aug 2024 14:57:56 +0800 Subject: [PATCH 08/34] Refactor qwen2_vl.py based on review comments. --- vllm/model_executor/models/qwen2_vl.py | 214 +++++++++++++++---------- vllm/multimodal/base.py | 14 +- vllm/transformers_utils/processor.py | 37 +++++ 3 files changed, 171 insertions(+), 94 deletions(-) create mode 100644 vllm/transformers_utils/processor.py diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 37a02859d8520..85bdec3e65e00 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -26,7 +26,7 @@ from array import array from collections.abc import Mapping from functools import partial, lru_cache -from typing import Tuple, Optional, List, Iterable, Type +from typing import Tuple, Optional, List, Iterable, Type, TypedDict import torch import torch.nn as nn @@ -46,6 +46,7 @@ from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -59,24 +60,36 @@ from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import SequenceData, SamplerOutput, IntermediateTensors, VLLM_TOKEN_ID_ARRAY_TYPE +from vllm.transformers_utils.processor import get_processor logger = init_logger(__name__) -# === Vision Encoder === # +# === Vision Inputs === # + + +class Qwen2VLImageInputs(TypedDict): + pixel_values: torch.Tensor + """Shape: `(num_patches, num_channels * patch_size * patch_size)`""" -def quick_gelu(x: torch.Tensor) -> torch.Tensor: - return x * torch.sigmoid(1.702 * x) + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. + """ -class QuickGELU(nn.Module): - """Applies the Gaussian Error Linear Units function.""" +class Qwen2VLVideoInputs(TypedDict): + pixel_values_videos: torch.Tensor + """Shape: `(num_patches, num_channels * temporal_patch_size * patch_size * patch_size)`""" - def __init__(self) -> None: - super(QuickGELU, self).__init__() + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. 
+ """ - def forward(self, input: torch.Tensor) -> torch.Tensor: - return quick_gelu(input) +# === Vision Encoder === # class Qwen2VisionMLP(nn.Module): @@ -104,7 +117,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def rotate_half(x, interleaved=False): +def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: if not interleaved: x1, x2 = x.chunk(2, dim=-1) return torch.cat((-x2, x1), dim=-1) @@ -115,7 +128,12 @@ def rotate_half(x, interleaved=False): two=2) -def apply_rotary_emb_torch(x, cos, sin, interleaved=False): +def apply_rotary_emb_torch( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + interleaved: bool = False +) -> torch.Tensor: """ x: (batch_size, seqlen, nheads, headdim) cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) @@ -137,8 +155,7 @@ def apply_rotary_emb_torch(x, cos, sin, interleaved=False): ) -def apply_rotary_pos_emb_vision(t: torch.Tensor, - freqs: torch.Tensor) -> torch.Tensor: +def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: t_ = t.float() cos = freqs.cos() sin = freqs.sin() @@ -394,10 +411,12 @@ def __init__( quant_config=quant_config, ) - def get_dtype(self) -> torch.dtype: + @property + def dtype(self) -> torch.dtype: return self.blocks[0].mlp.fc2.weight.dtype - def get_device(self) -> torch.device: + @property + def device(self) -> torch.device: return self.blocks[0].mlp.fc2.weight.device def rot_pos_emb(self, grid_thw): @@ -431,7 +450,7 @@ def forward( grid_thw: torch.Tensor, ) -> torch.Tensor: # patchify - x = x.to(device=self.get_device(), dtype=self.get_dtype()) + x = x.to(device=self.device, dtype=self.dtype) x = self.patch_embed(x) # compute position embedding @@ -455,45 +474,6 @@ def forward( # === Vision input helpers === # - -def get_processor( - processor_name: str, - *args, - trust_remote_code: bool = False, - **kwargs, -): - """Gets a processor for the given model name via HuggingFace. - - Derived from `vllm.transformers_utils.image_processor.get_image_processor`. - """ - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoProcessor - - try: - processor = AutoProcessor.from_pretrained( - processor_name, - *args, - trust_remote_code=trust_remote_code, - **kwargs) - except ValueError as e: - # If the error pertains to the processor class not existing or not - # currently being imported, suggest using the --trust-remote-code flag. - # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors - if not trust_remote_code: - err_msg = ( - "Failed to load the processor. If the processor is " - "a custom processor not yet available in the HuggingFace " - "transformers library, consider setting " - "`trust_remote_code=True` in LLM or using the " - "`--trust-remote-code` flag in the CLI.") - raise RuntimeError(err_msg) from e - else: - raise e - - return processor - - cached_get_processor = lru_cache(get_processor) MAX_TEMPORAL_IMAGE_NUM = 10 @@ -528,10 +508,6 @@ def mm_input_mapper_for_qwen2_vl( logger.error("Failed to process image (%s)", data) raise - # Ensure different modalities will return a batch_data with same keys, avoid error in `MultiModalInputs.batch()`. 
- for key in ['pixel_values', 'image_grid_thw', 'pixel_values_videos', 'video_grid_thw']: - batch_data.setdefault(key, None) - return MultiModalInputs(batch_data) @@ -652,6 +628,17 @@ def input_processor_for_qwen2_vl(ctx: InputContext, ) +def merge_multimodal_embeddings_for_qwen2_vl( + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + multimodal_embeddings: torch.Tensor, + placeholder_token_id: int +) -> torch.Tensor: + mask = (input_ids == placeholder_token_id) + inputs_embeds[mask, :] = multimodal_embeddings + return inputs_embeds + + @MULTIMODAL_REGISTRY.register_image_input_mapper( image_input_mapper_for_qwen2_vl) @MULTIMODAL_REGISTRY.register_input_mapper("video", @@ -691,6 +678,70 @@ def __init__(self, self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Qwen2VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + if not isinstance(image_grid_thw, torch.Tensor): + raise ValueError("Incorrect type of image grid_thw. " + f"Got type: {type(image_grid_thw)}") + + return Qwen2VLImageInputs( + pixel_values=pixel_values, + image_grid_thw=image_grid_thw + ) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + + if pixel_values_videos is None: + return None + + if not isinstance(pixel_values_videos, torch.Tensor): + raise ValueError("Incorrect type of video pixel values. " + f"Got type: {type(pixel_values_videos)}") + + if not isinstance(video_grid_thw, torch.Tensor): + raise ValueError("Incorrect type of video grid_thw. " + f"Got type: {type(video_grid_thw)}") + + return Qwen2VLVideoInputs( + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: + pixel_values = image_input.pixel_values.type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=image_input.image_grid_thw) + return image_embeds + + def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor: + pixel_values_videos = video_input.pixel_values_videos.type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=video_input.video_grid_thw) + return video_embeds + + def _merge_multimodal_embeddings( + self, + input_ids: torch.Tensor, + input_embeds: torch.Tensor, + image_embeds: Optional[torch.Tensor], + video_embeds: Optional[torch.Tensor], + image_placeholder_token_id: int, + video_placeholder_token_id: int, + ) -> torch.Tensor: + pass + def forward( self, input_ids: torch.Tensor, @@ -711,47 +762,42 @@ def forward( pixel_values_videos: Pixel values of videos to be fed to a model. `None` if no videos are passed. video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. `None` if no videos are passed. 
""" - pixel_values: torch.Tensor = kwargs.get('pixel_values', None) - image_grid_thw: torch.Tensor = kwargs.get('image_grid_thw', None) - pixel_values_videos: torch.Tensor = kwargs.get('pixel_values_videos', - None) - video_grid_thw: torch.Tensor = kwargs.get('video_grid_thw', None) - no_vision = pixel_values is None and pixel_values_videos is None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) - if no_vision: + if image_input is None and video_input is None: inputs_embeds = None else: - inputs_embeds = self.model.embed_tokens(input_ids) - if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope": assert positions.ndim == 2 and positions.size(0) == 3, \ f"multimodal section rotary embedding requires (3, seq_len) positions, but got {positions.size()}" - if pixel_values is not None: - pixel_values = pixel_values.type(self.visual.get_dtype()) - image_embeds = self.visual(pixel_values, - grid_thw=image_grid_thw) - image_mask = (input_ids == self.config.image_token_id) - inputs_embeds[image_mask, :] = image_embeds - if pixel_values_videos is not None: - pixel_values_videos = pixel_values_videos.type( - self.visual.get_dtype()) - video_embeds = self.visual(pixel_values_videos, - grid_thw=video_grid_thw) - video_mask = (input_ids == self.config.video_token_id) - inputs_embeds[video_mask, :] = video_embeds + inputs_embeds = self.model.embed_tokens(input_ids) + + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings_for_qwen2_vl( + input_ids, inputs_embeds, image_embeds, placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings_for_qwen2_vl( + input_ids, inputs_embeds, video_embeds, placeholder_token_id=self.config.video_token_id, + ) + input_ids = None - result = self.model( + hidden_states = self.model( input_ids=input_ids, positions=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, inputs_embeds=inputs_embeds, ) - return result + return hidden_states def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index b1583379a158d..032964fe0ac4e 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -79,21 +79,15 @@ def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: if len(inputs_list) == 0: return {} - keys = inputs_list[0].keys() - item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) for inputs in inputs_list: - if inputs.keys() != keys: - msg = f"Inputs do not share the same keys ({keys})" - raise ValueError(msg) + # For models that supports multiple modalities (e.g. Qwen2-VL), + # different modalities will return different data keys, + # so batch() should skip the same key check. for k, v in inputs.items(): - # For models that supports multiple modalities (e.g. Qwen2-VL), - # input mapper will set values of unused modality keys to None, - # and batching procedure should skip them. 
- if v is not None: - item_lists[k].append(v) + item_lists[k].append(v) return { k: MultiModalInputs._try_stack(item_list) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py new file mode 100644 index 0000000000000..2001746c5f7f9 --- /dev/null +++ b/vllm/transformers_utils/processor.py @@ -0,0 +1,37 @@ +from typing import cast + + +def get_processor( + processor_name: str, + *args, + trust_remote_code: bool = False, + **kwargs, +): + """Gets a processor for the given model name via HuggingFace.""" + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor + from transformers.processing_utils import ProcessorMixin + + try: + processor = AutoProcessor.from_pretrained( + processor_name, + *args, + trust_remote_code=trust_remote_code, + **kwargs) + except ValueError as e: + # If the error pertains to the processor class not existing or not + # currently being imported, suggest using the --trust-remote-code flag. + # Unlike AutoTokenizer, AutoProcessor does not separate such errors + if not trust_remote_code: + err_msg = ( + "Failed to load the processor. If the processor is " + "a custom processor not yet available in the HuggingFace " + "transformers library, consider setting " + "`trust_remote_code=True` in LLM or using the " + "`--trust-remote-code` flag in the CLI.") + raise RuntimeError(err_msg) from e + else: + raise e + + return cast(ProcessorMixin, processor) From 14fe12a3758a338459859e0f48295a19436948ed Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Thu, 29 Aug 2024 15:26:44 +0800 Subject: [PATCH 09/34] reformat --- .../model_executor/layers/rotary_embedding.py | 6 +- vllm/model_executor/models/qwen2_vl.py | 55 ++++++++++--------- vllm/worker/model_runner.py | 4 +- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b4bd199474804..8f488e273b4b7 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -860,9 +860,9 @@ def get_input_positions( ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" - if torch.is_tensor(image_grid_thw): + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() - if torch.is_tensor(video_grid_thw): + if isinstance(video_grid_thw, torch.Tensor): video_grid_thw = video_grid_thw.tolist() input_tokens_tensor = torch.tensor(input_tokens) @@ -989,7 +989,7 @@ def get_rope( # The correct one should be "longrope" but keep "su" here # for backward compatible if scaling_type not in {"su", "longrope"}: - scaling_factor = rope_scaling.get("factor") + scaling_factor = rope_scaling.get("factor", 1.0) if scaling_type == "llama3": low_freq_factor = rope_scaling["low_freq_factor"] high_freq_factor = rope_scaling["high_freq_factor"] diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 85bdec3e65e00..7bbc1339a0ef9 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -64,7 +64,6 @@ logger = init_logger(__name__) - # === Vision Inputs === # @@ -89,6 +88,7 @@ class Qwen2VLVideoInputs(TypedDict): This should be in `(grid_t, grid_h, grid_w)` format. 
""" + # === Vision Encoder === # @@ -128,12 +128,10 @@ def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: two=2) -def apply_rotary_emb_torch( - x: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - interleaved: bool = False -) -> torch.Tensor: +def apply_rotary_emb_torch(x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + interleaved: bool = False) -> torch.Tensor: """ x: (batch_size, seqlen, nheads, headdim) cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) @@ -155,7 +153,8 @@ def apply_rotary_emb_torch( ) -def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: +def apply_rotary_pos_emb_vision(t: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: t_ = t.float() cos = freqs.cos() sin = freqs.sin() @@ -629,11 +628,9 @@ def input_processor_for_qwen2_vl(ctx: InputContext, def merge_multimodal_embeddings_for_qwen2_vl( - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: torch.Tensor, - placeholder_token_id: int -) -> torch.Tensor: + input_ids: torch.Tensor, inputs_embeds: torch.Tensor, + multimodal_embeddings: torch.Tensor, + placeholder_token_id: int) -> torch.Tensor: mask = (input_ids == placeholder_token_id) inputs_embeds[mask, :] = multimodal_embeddings return inputs_embeds @@ -694,10 +691,8 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of image grid_thw. " f"Got type: {type(image_grid_thw)}") - return Qwen2VLImageInputs( - pixel_values=pixel_values, - image_grid_thw=image_grid_thw - ) + return Qwen2VLImageInputs(pixel_values=pixel_values, + image_grid_thw=image_grid_thw) def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]: @@ -720,15 +715,19 @@ def _parse_and_validate_video_input( video_grid_thw=video_grid_thw, ) - def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: - pixel_values = image_input.pixel_values.type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=image_input.image_grid_thw) + def _process_image_input(self, + image_input: Qwen2VLImageInputs) -> torch.Tensor: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, + grid_thw=image_input["image_grid_thw"]) return image_embeds - def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor: - pixel_values_videos = video_input.pixel_values_videos.type( + def _process_video_input(self, + video_input: Qwen2VLVideoInputs) -> torch.Tensor: + pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=video_input.video_grid_thw) + video_embeds = self.visual(pixel_values_videos, + grid_thw=video_input["video_grid_thw"]) return video_embeds def _merge_multimodal_embeddings( @@ -779,13 +778,19 @@ def forward( if image_input is not None: image_embeds = self._process_image_input(image_input) inputs_embeds = merge_multimodal_embeddings_for_qwen2_vl( - input_ids, inputs_embeds, image_embeds, placeholder_token_id=self.config.image_token_id, + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, ) if video_input is not None: video_embeds = self._process_video_input(video_input) inputs_embeds = merge_multimodal_embeddings_for_qwen2_vl( - input_ids, inputs_embeds, video_embeds, placeholder_token_id=self.config.video_token_id, + input_ids, + inputs_embeds, + video_embeds, + 
placeholder_token_id=self.config.video_token_id, ) input_ids = None diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 6671abe48f561..900b7ccf27927 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -174,7 +174,7 @@ class InterDataForSeqGroup: def simple_reinit(self): self.input_tokens[0].clear() # type: ignore self.input_positions[0].clear() # type: ignore - self.mrope_input_positions = None # type: ignore + self.mrope_input_positions = None # type: ignore self.seq_lens[0] = 0 # type: ignore self.orig_seq_lens[0] = 0 # type: ignore self.query_lens[0] = 0 # type: ignore @@ -731,7 +731,7 @@ def build(self) -> ModelInputForGPU: # prefix caching and there is no decode request. return self.model_input_cls() - mrope_input_positions = None + mrope_input_positions: Optional[List[List[int]]] = None if any(inter_data.mrope_input_positions is not None for inter_data in self.inter_data_list): mrope_input_positions = [[] for _ in range(3)] From 89def23b9370217ec2cda03e69f16249946ef96f Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Thu, 29 Aug 2024 15:45:12 +0800 Subject: [PATCH 10/34] reformat --- .../model_executor/layers/rotary_embedding.py | 3 +- vllm/model_executor/models/qwen2_vl.py | 117 +++++++++++------- vllm/worker/model_runner.py | 40 +++--- 3 files changed, 93 insertions(+), 67 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 8f488e273b4b7..e8cc63064ca54 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -904,7 +904,8 @@ def get_input_positions( video_index += 1 remain_videos -= 1 ed = ed_video - llm_grid_t, llm_grid_h, llm_grid_w = t, h // spatial_merge_size, w // spatial_merge_size + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size text_len = ed - st st_idx = llm_pos_ids_list[-1].max() + 1 if len( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 7bbc1339a0ef9..0c2fe7d48ef7f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,22 +25,23 @@ from array import array from collections.abc import Mapping -from functools import partial, lru_cache -from typing import Tuple, Optional, List, Iterable, Type, TypedDict +from functools import lru_cache, partial +from typing import Iterable, List, Optional, Tuple, Type, TypedDict import torch import torch.nn as nn import torch.nn.functional as F -from PIL import Image from einops import rearrange, repeat # from vllm_flash_attn.flash_attn_interface import flash_attn_varlen_func from flash_attn import flash_attn_varlen_func +from PIL import Image from transformers import Qwen2VLConfig -from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig +from transformers.models.qwen2_vl.configuration_qwen2_vl import ( + Qwen2VLVisionConfig) from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize from vllm.attention import AttentionMetadata -from vllm.config import MultiModalConfig, CacheConfig +from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs @@ -56,10 +57,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsMultiModal from 
vllm.model_executor.models.qwen2 import Qwen2Model -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalInputs +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalInputs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor -from vllm.sequence import SequenceData, SamplerOutput, IntermediateTensors, VLLM_TOKEN_ID_ARRAY_TYPE +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SamplerOutput, SequenceData) from vllm.transformers_utils.processor import get_processor logger = init_logger(__name__) @@ -69,7 +72,9 @@ class Qwen2VLImageInputs(TypedDict): pixel_values: torch.Tensor - """Shape: `(num_patches, num_channels * patch_size * patch_size)`""" + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ image_grid_thw: torch.Tensor """Shape: `(num_images, 3)` @@ -80,7 +85,10 @@ class Qwen2VLImageInputs(TypedDict): class Qwen2VLVideoInputs(TypedDict): pixel_values_videos: torch.Tensor - """Shape: `(num_patches, num_channels * temporal_patch_size * patch_size * patch_size)`""" + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ video_grid_thw: torch.Tensor """Shape: `(num_videos, 3)` @@ -207,12 +215,12 @@ def forward( batch_size = q.shape[1] q, k, v = [ - rearrange(x, 's b ... -> b s ...').contiguous() for x in (q, k, v) + rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v) ] if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] + q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() output = flash_attn_varlen_func(q, @@ -225,9 +233,9 @@ def forward( 0, causal=False) - context_layer = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + context_layer = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) context_layer = rearrange(context_layer, - 'b s h d -> s b (h d)').contiguous() + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output @@ -241,10 +249,12 @@ def __init__( num_heads: int, mlp_ratio: float, act_layer: Type[nn.Module] = QuickGELU, - norm_layer: Type[nn.Module] = partial(nn.LayerNorm, eps=1e-6), + norm_layer: Type[nn.Module] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) self.norm1 = norm_layer(dim) self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) @@ -301,12 +311,14 @@ def __init__( self, d_model: int, context_dim: int, - norm_layer: Type[nn.Module] = partial(nn.LayerNorm, eps=1e-6), + norm_layer: Type[nn.Module] = None, spatial_merge_size: int = 2, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.hidden_size = context_dim * (spatial_merge_size**2) + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) self.ln_q = norm_layer(context_dim) self.mlp = nn.ModuleList([ ColumnParallelLinear(self.hidden_size, @@ -459,7 +471,7 @@ def forward( cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( dim=0, dtype=torch.int32) - cu_seqlens = F.pad(cu_seqlens, (1, 0), 'constant', 0) + cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) # transformers x = x.unsqueeze(1) @@ -517,7 +529,7 @@ def mm_input_mapper_for_qwen2_vl( def _get_max_image_info(image_processor, - data_type_key: str = 'image', + data_type_key: str = "image", mm_count: int = 1): max_resized_height, max_resized_width = smart_resize( height=9999999, @@ -529,24 +541,26 @@ def _get_max_image_info(image_processor, max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28), ) - if data_type_key == 'image': + if data_type_key == "image": max_grid_t = mm_count else: - assert data_type_key == 'video' + assert data_type_key == "video" max_grid_t = max(mm_count // image_processor.temporal_patch_size, 1) max_grid_h = max_resized_height // image_processor.patch_size max_grid_w = max_resized_width // image_processor.patch_size max_image_tokens = max_grid_t * max_grid_h * max_grid_w - max_llm_image_tokens = max_image_tokens // image_processor.merge_size // image_processor.merge_size + max_llm_image_tokens = (max_image_tokens // image_processor.merge_size // + image_processor.merge_size) return max_resized_height, max_resized_width, max_llm_image_tokens def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: image_processor = cached_get_image_processor(ctx.model_config.model) - max_resized_height, max_resized_width, max_llm_image_tokens = _get_max_image_info( - image_processor, data_type_key=data_type_key, mm_count=1) + max_resized_height, max_resized_width, max_llm_image_tokens = \ + _get_max_image_info(image_processor, data_type_key=data_type_key, + mm_count=1) return max_llm_image_tokens @@ -562,23 +576,25 @@ def dummy_data_for_qwen2_vl( image_processor = cached_get_image_processor(ctx.model_config.model) num_images = mm_counts["image"] - max_resized_height, max_resized_width, max_llm_image_tokens = _get_max_image_info( - image_processor, data_type_key="image", mm_count=num_images) + max_resized_height, max_resized_width, max_llm_image_tokens = \ + _get_max_image_info(image_processor, data_type_key="image", + mm_count=num_images) if seq_len - max_llm_image_tokens - 2 < 0: raise RuntimeError( - f'Qwen2-VL cannot process {num_images} 
images in a prompt, ' - f'please increase max_model_len or reduce image limit by --limit-mm-per-prompt.' - ) + f"Qwen2-VL cannot process {num_images} images in a prompt, " + "please increase max_model_len or reduce image limit by " + "--limit-mm-per-prompt.") # Check video counts. num_videos = mm_counts["video"] - max_resized_height, max_resized_width, max_llm_video_tokens = _get_max_image_info( - image_processor, data_type_key="video", mm_count=num_videos) + max_resized_height, max_resized_width, max_llm_video_tokens = \ + _get_max_image_info(image_processor, data_type_key="video", + mm_count=num_videos) if seq_len - max_llm_video_tokens - 2 < 0: raise RuntimeError( - f'Qwen2-VL cannot process {num_images} videos in a prompt, ' - f'please increase max_model_len or reduce video limit by --limit-mm-per-prompt.' - ) + f"Qwen2-VL cannot process {num_images} videos in a prompt, " + "please increase max_model_len or reduce video limit by " + "--limit-mm-per-prompt.") hf_config = ctx.get_hf_config(Qwen2VLConfig) token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, @@ -609,9 +625,9 @@ def input_processor_for_qwen2_vl(ctx: InputContext, processor = cached_get_processor(ctx.model_config.model) - prompt = llm_inputs['prompt'] + prompt = llm_inputs["prompt"] if prompt is None: - prompt_token_ids = llm_inputs['prompt_token_ids'] + prompt_token_ids = llm_inputs["prompt_token_ids"] prompt = processor.tokenizer.decode(prompt_token_ids) inputs = processor(text=[prompt], @@ -621,8 +637,8 @@ def input_processor_for_qwen2_vl(ctx: InputContext, return_tensors="pt") return LLMInputs( - prompt_token_ids=inputs['input_ids'][0].tolist(), - prompt=llm_inputs['prompt'], + prompt_token_ids=inputs["input_ids"][0].tolist(), + prompt=llm_inputs["prompt"], multi_modal_data=multi_modal_data, ) @@ -659,7 +675,7 @@ def __init__(self, self.visual = Qwen2VisionTransformer( config.vision_config, - norm_eps=getattr(config, 'rms_norm_eps', 1e-6), + norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=quant_config, ) @@ -753,13 +769,21 @@ def forward( """Run forward pass for Qwen2-VL. Args: - input_ids: Flattened (concatenated) input_ids corresponding to a batch. - positions: Flattened (concatenated) position ids corresponding to a batch. - NOTE: If mrope is enabled (default setting for Qwen2-VL opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. `None` if no videos are passed. + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen2-VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,). + pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. + `None` if no images are passed. + pixel_values_videos: Pixel values of videos to be fed to a model. + `None` if no videos are passed. + video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. + `None` if no videos are passed. 
""" image_input = self._parse_and_validate_image_input(**kwargs) @@ -770,8 +794,9 @@ def forward( else: if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope": - assert positions.ndim == 2 and positions.size(0) == 3, \ - f"multimodal section rotary embedding requires (3, seq_len) positions, but got {positions.size()}" + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") inputs_embeds = self.model.embed_tokens(input_ids) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 900b7ccf27927..202a9c1d8ba21 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -29,6 +29,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata, SamplingMetadataCache +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models.interfaces import (supports_lora, @@ -496,8 +497,6 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, if inter_data.mrope_input_positions is None: inter_data.mrope_input_positions = [None] * inter_data.n_seqs - from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding - inter_data.mrope_input_positions[ seq_idx] = MRotaryEmbedding.get_next_input_positions( seq_data.mrope_position_delta, @@ -653,31 +652,31 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if self.runner.model_is_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) - assert image_grid_thw is not None or video_grid_thw is not None, \ - "mrope embedding type requires multi-modal input mapper returns 'image_grid_thw' or 'video_grid_thw'." + assert image_grid_thw is not None or video_grid_thw is not None, ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") hf_config = self.runner.model_config.hf_config - from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding - inter_data.mrope_input_positions = [None] * inter_data.n_seqs for seq_idx in range(inter_data.n_seqs): seq_data = seq_group_metadata.seq_data[ inter_data.seq_ids[seq_idx]] token_ids = seq_data.get_token_ids() - mrope_input_positions, mrope_position_delta = MRotaryEmbedding.get_input_positions( - token_ids, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. - spatial_merge_size, - context_len=inter_data.context_lens[seq_idx], - ) + mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. 
+ spatial_merge_size, + context_len=inter_data.context_lens[seq_idx], + ) seq_data.mrope_position_delta = mrope_position_delta inter_data.mrope_input_positions[ @@ -737,12 +736,13 @@ def build(self) -> ModelInputForGPU: mrope_input_positions = [[] for _ in range(3)] for idx in range(3): for inter_data in self.inter_data_list: - if inter_data.mrope_input_positions is None: + msections = inter_data.mrope_input_positions + if msections is None: for _seq_input_positions in inter_data.input_positions: mrope_input_positions[idx].extend( _seq_input_positions) else: - for _seq_mrope_input_positions in inter_data.mrope_input_positions: + for _seq_mrope_input_positions in msections: mrope_input_positions[idx].extend( _seq_mrope_input_positions[idx]) input_positions = None From e721e608653ea00bb8ad18506b3cd399049a083a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Thu, 29 Aug 2024 16:20:48 +0800 Subject: [PATCH 11/34] Fix the bug of model_is_mrope in model_runner.py. --- vllm/worker/model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 202a9c1d8ba21..24ea8f6e5d583 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1274,6 +1274,8 @@ def model_is_mrope(self) -> bool: """Detect if the model has "mrope" rope_scaling type. mrope requires keep "rope_deltas" between prompt and decoding phases.""" rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + if rope_scaling is None: + return False return rope_scaling.get("type", None) == "mrope" @torch.inference_mode() From d66d167bd8baeb17a9b4d0495438d4fd665609e1 Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Thu, 29 Aug 2024 16:31:18 +0800 Subject: [PATCH 12/34] fix type hints in qwen2_vl.py --- vllm/model_executor/models/qwen2_vl.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0c2fe7d48ef7f..9e008748e1b5f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -268,7 +268,8 @@ def __init__( act_layer=act_layer, quant_config=quant_config) - def forward(self, x, cu_seqlens, rotary_pos_emb) -> torch.Tensor: + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor) -> torch.Tensor: x = x + self.attn(self.norm1(x), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb) @@ -430,7 +431,7 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.blocks[0].mlp.fc2.weight.device - def rot_pos_emb(self, grid_thw): + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: pos_ids = [] for t, h, w in grid_thw: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) @@ -487,8 +488,6 @@ def forward( cached_get_processor = lru_cache(get_processor) -MAX_TEMPORAL_IMAGE_NUM = 10 - def mm_input_mapper_for_qwen2_vl( ctx: InputContext, From acd85ed3f0d45b2966904418074db6cae33c2a40 Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Thu, 29 Aug 2024 18:04:12 +0800 Subject: [PATCH 13/34] Update mm input processors according to new MultiModalInput.batch() implementation. 
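
With the new batching behavior, the multi-modal kwargs handed to the model
may arrive either as a single stacked tensor or as a list of per-request
tensors, so the model now normalizes both shapes before use. A rough sketch
of the intended normalization (the helper name below is illustrative, not
the exact code added in this patch, and the feature size 1176 is an
arbitrary example):

    import torch

    def normalize_mm_input(mm_input):
        # A single tensor may already be flattened (patches, dim) or
        # batched as (batch, patches, dim); lists are concatenated.
        if isinstance(mm_input, torch.Tensor):
            if mm_input.ndim == 2:
                return mm_input
            return torch.concat(list(mm_input))   # (B, N, D) -> (B*N, D)
        return torch.concat(mm_input)              # [(N_i, D)] -> (sum N_i, D)

    out = normalize_mm_input([torch.randn(4, 1176), torch.randn(6, 1176)])
    assert out.shape == (10, 1176)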
--- vllm/model_executor/models/qwen2_vl.py | 41 ++++++++++++++++++-------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 9e008748e1b5f..bd3ae04ea6e76 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -26,7 +26,7 @@ from array import array from collections.abc import Mapping from functools import lru_cache, partial -from typing import Iterable, List, Optional, Tuple, Type, TypedDict +from typing import Iterable, List, Optional, Tuple, Type, TypedDict, Union import torch import torch.nn as nn @@ -690,6 +690,23 @@ def __init__(self, self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() + def _validate_and_reshape_mm_tensor(self, + mm_input: Union[torch.Tensor, + List[torch.Tensor]], + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim}") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Qwen2VLImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -698,14 +715,15 @@ def _parse_and_validate_image_input( if pixel_values is None: return None - if not isinstance(pixel_values, torch.Tensor): + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of image pixel values. " f"Got type: {type(pixel_values)}") - if not isinstance(image_grid_thw, torch.Tensor): - raise ValueError("Incorrect type of image grid_thw. " - f"Got type: {type(image_grid_thw)}") - return Qwen2VLImageInputs(pixel_values=pixel_values, image_grid_thw=image_grid_thw) @@ -717,13 +735,10 @@ def _parse_and_validate_video_input( if pixel_values_videos is None: return None - if not isinstance(pixel_values_videos, torch.Tensor): - raise ValueError("Incorrect type of video pixel values. " - f"Got type: {type(pixel_values_videos)}") - - if not isinstance(video_grid_thw, torch.Tensor): - raise ValueError("Incorrect type of video grid_thw. " - f"Got type: {type(video_grid_thw)}") + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") return Qwen2VLVideoInputs( pixel_values_videos=pixel_values_videos, From 87ba5ed0e64e45f26faa448d8178a5b6b03ba7da Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Fri, 30 Aug 2024 13:58:18 +0800 Subject: [PATCH 14/34] Fix SamplerOutput. 
--- vllm/model_executor/models/qwen2_vl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index bd3ae04ea6e76..03890242a4a54 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -52,7 +52,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsMultiModal @@ -62,7 +62,7 @@ from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SamplerOutput, SequenceData) + SequenceData) from vllm.transformers_utils.processor import get_processor logger = init_logger(__name__) From cda300aabe083ebf12849dd331a3fb4d1deae17e Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Fri, 30 Aug 2024 15:37:50 +0800 Subject: [PATCH 15/34] Fix bug of quantization. --- vllm/model_executor/models/qwen2_vl.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 03890242a4a54..4758da3af5392 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -675,7 +675,9 @@ def __init__(self, self.visual = Qwen2VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, + + # NOTE: Qwen2-VL does not support any quantization method now. + quant_config=None, ) self.model = Qwen2Model(config, cache_config, quant_config) @@ -898,7 +900,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): head_size) loaded_weight = loaded_weight.transpose(0, 1) loaded_weight = loaded_weight.reshape(-1) - param = params_dict[name] + try: + param = params_dict[name] + except KeyError: + print(params_dict.keys()) + raise weight_loader = getattr(param, "weight_loader", default_weight_loader) From da03a3f7d81130738a11f909b86d8fb4e8e88281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Sun, 1 Sep 2024 01:08:47 +0800 Subject: [PATCH 16/34] Bugfix of type hints in qwen2_vl.py. --- vllm/model_executor/models/qwen2_vl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 4758da3af5392..b1cbdffbb27d3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -24,9 +24,8 @@ """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from array import array -from collections.abc import Mapping from functools import lru_cache, partial -from typing import Iterable, List, Optional, Tuple, Type, TypedDict, Union +from typing import Iterable, List, Mapping, Optional, Tuple, Type, TypedDict, Union import torch import torch.nn as nn From 25fb189fa38cd3b742496dc45bc28b89c8dda3b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Sun, 1 Sep 2024 01:27:47 +0800 Subject: [PATCH 17/34] reformat. 
--- vllm/model_executor/models/qwen2_vl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b1cbdffbb27d3..e05cf8f45b97b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,7 +25,8 @@ from array import array from functools import lru_cache, partial -from typing import Iterable, List, Mapping, Optional, Tuple, Type, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict, + Union) import torch import torch.nn as nn From faebfe4be2af19355f98a50c03aa571ab37188ce Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sun, 1 Sep 2024 02:16:39 -0700 Subject: [PATCH 18/34] fix typo from resolving conflict --- vllm/entrypoints/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 5f22b8ef01d61..0337b15312c1e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -143,7 +143,7 @@ def add(self, modality: Literal["image", "audio"], if model_type == "ultravox": return "<|reserved_special_token_0|>" raise TypeError(f"Unknown model type: {model_type}") - elif modality == "video": + elif modality == "video": if model_type == "qwen2_vl": return "<|vision_start|><|video_pad|><|vision_end|>" raise TypeError(f"Unknown model type: {model_type}") From 2e87db7e708724110a84586dc916461ee9db09f7 Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Mon, 2 Sep 2024 10:10:45 +0800 Subject: [PATCH 19/34] Bugfix in qwen2_vl.py. --- vllm/model_executor/models/qwen2_vl.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index e05cf8f45b97b..ba2219c307ccd 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -642,15 +642,6 @@ def input_processor_for_qwen2_vl(ctx: InputContext, ) -def merge_multimodal_embeddings_for_qwen2_vl( - input_ids: torch.Tensor, inputs_embeds: torch.Tensor, - multimodal_embeddings: torch.Tensor, - placeholder_token_id: int) -> torch.Tensor: - mask = (input_ids == placeholder_token_id) - inputs_embeds[mask, :] = multimodal_embeddings - return inputs_embeds - - @MULTIMODAL_REGISTRY.register_image_input_mapper( image_input_mapper_for_qwen2_vl) @MULTIMODAL_REGISTRY.register_input_mapper("video", @@ -765,13 +756,13 @@ def _process_video_input(self, def _merge_multimodal_embeddings( self, input_ids: torch.Tensor, - input_embeds: torch.Tensor, - image_embeds: Optional[torch.Tensor], - video_embeds: Optional[torch.Tensor], - image_placeholder_token_id: int, - video_placeholder_token_id: int, + inputs_embeds: torch.Tensor, + multimodal_embeddings: torch.Tensor, + placeholder_token_id: int, ) -> torch.Tensor: - pass + mask = (input_ids == placeholder_token_id) + inputs_embeds[mask, :] = multimodal_embeddings + return inputs_embeds def forward( self, @@ -818,7 +809,7 @@ def forward( if image_input is not None: image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings_for_qwen2_vl( + inputs_embeds = self._merge_multimodal_embeddings( input_ids, inputs_embeds, image_embeds, @@ -827,7 +818,7 @@ def forward( if video_input is not None: video_embeds = self._process_video_input(video_input) - inputs_embeds = merge_multimodal_embeddings_for_qwen2_vl( + 
inputs_embeds = self._merge_multimodal_embeddings( input_ids, inputs_embeds, video_embeds, From 39a106901a3378948dfce534dfb7d5d723f28b77 Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Thu, 5 Sep 2024 12:40:41 +0800 Subject: [PATCH 20/34] Adding xformers implementation --- vllm/model_executor/models/qwen2_vl.py | 70 ++++++++++++++++++++------ 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ba2219c307ccd..a13714ed6e349 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -32,15 +32,16 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -# from vllm_flash_attn.flash_attn_interface import flash_attn_varlen_func -from flash_attn import flash_attn_varlen_func from PIL import Image from transformers import Qwen2VLConfig from transformers.models.qwen2_vl.configuration_qwen2_vl import ( Qwen2VLVisionConfig) from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize +import vllm.envs as envs from vllm.attention import AttentionMetadata +from vllm.attention.selector import (_Backend, backend_name_to_enum, + get_global_forced_attn_backend) from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils @@ -61,6 +62,7 @@ MultiModalInputs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor +from vllm.platforms import current_platform from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) from vllm.transformers_utils.processor import get_processor @@ -194,6 +196,26 @@ def __init__( output_size=embed_dim, quant_config=quant_config) + # Detect attention implementation. + selected_backend: Optional[_Backend] = get_global_forced_attn_backend() + if selected_backend is None: + backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + if backend_by_env_var is not None: + selected_backend = backend_name_to_enum(backend_by_env_var) + if selected_backend is None: + # For Volta and Turing GPUs, use xformers instead. + self._use_flash_attn = current_platform.get_device_capability( + )[0] >= 8 + else: + if selected_backend == _Backend.FLASH_ATTN: + self._use_flash_attn = True + elif selected_backend == _Backend.XFORMERS: + self._use_flash_attn = False + else: + raise RuntimeError( + f"Qwen2-VL does not support {selected_backend} backend now." + ) + def forward( self, x: torch.Tensor, @@ -220,20 +242,36 @@ def forward( if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] - - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - output = flash_attn_varlen_func(q, - k, - v, - cu_seqlens, - cu_seqlens, - max_seqlen, - max_seqlen, - 0, - causal=False) - - context_layer = rearrange(output, "(b s) ... -> b s ...", b=batch_size) + + if self._use_flash_attn: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) + from flash_attn import flash_attn_varlen_func + + q, k, v = [rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]] + + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0, + causal=False) + + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) + else: + from xformers import ops as xops + + context_layer = xops.memory_efficient_attention_forward(q, + k, + v, + p=0, + scale=None) context_layer = rearrange(context_layer, "b s h d -> s b (h d)").contiguous() From 855c78b49eae46dffdcdfd672b5666bfb6798b5c Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Thu, 5 Sep 2024 20:10:48 +0800 Subject: [PATCH 21/34] Fix bug of attn_bias in xformers implementation --- vllm/model_executor/models/qwen2_vl.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a13714ed6e349..b6f6fa654123f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -266,12 +266,14 @@ def forward( b=batch_size) else: from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask - context_layer = xops.memory_efficient_attention_forward(q, - k, - v, - p=0, - scale=None) + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) context_layer = rearrange(context_layer, "b s h d -> s b (h d)").contiguous() From 091983f60bb97de1e20f4cc963cf30181afd1e22 Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Fri, 6 Sep 2024 12:27:47 +0800 Subject: [PATCH 22/34] Fix bug in xformers implementation, and add backend check in vision attention. --- vllm/model_executor/models/qwen2_vl.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b6f6fa654123f..7d6ea54379371 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,7 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" - +import importlib.util from array import array from functools import lru_cache, partial from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict, @@ -204,8 +204,26 @@ def __init__( selected_backend = backend_name_to_enum(backend_by_env_var) if selected_backend is None: # For Volta and Turing GPUs, use xformers instead. - self._use_flash_attn = current_platform.get_device_capability( - )[0] >= 8 + device_available = current_platform.get_device_capability()[0] >= 8 + if device_available: + if spec := importlib.util.find_spec("flash_attn") is not None: + flash_attn = importlib.util.module_from_spec(spec) + flash_attn_available = hasattr(flash_attn, + "flash_attn_varlen_func") + else: + flash_attn_available = False + + if flash_attn_available: + self._use_flash_attn = True + else: + logger.warning( + "Current Qwen2-VL implementation has a bug with " + "`vllm-flash-attn` inside vision module, so we use " + "xformers backend instead. 
You can run `pip install " + "flash-attn to use flash-attention backend.") + self._use_flash_attn = False + else: + self._use_flash_attn = False else: if selected_backend == _Backend.FLASH_ATTN: self._use_flash_attn = True From 773958865faa572134f804fbe42939962876ab56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Fri, 6 Sep 2024 14:45:08 +0800 Subject: [PATCH 23/34] Bugfix in qwen2_vl.py. --- vllm/model_executor/models/qwen2_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 7d6ea54379371..e07c871c1a5e7 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -206,7 +206,7 @@ def __init__( # For Volta and Turing GPUs, use xformers instead. device_available = current_platform.get_device_capability()[0] >= 8 if device_available: - if spec := importlib.util.find_spec("flash_attn") is not None: + if spec := importlib.util.find_spec("flash_attn"): flash_attn = importlib.util.module_from_spec(spec) flash_attn_available = hasattr(flash_attn, "flash_attn_varlen_func") From 5bab9bae04196df8b45143d2defd8ac8e8524dbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Fri, 6 Sep 2024 15:05:50 +0800 Subject: [PATCH 24/34] Bugfix in qwen2_vl.py. --- vllm/model_executor/models/qwen2_vl.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index e07c871c1a5e7..265c0e4062141 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -206,14 +206,9 @@ def __init__( # For Volta and Turing GPUs, use xformers instead. device_available = current_platform.get_device_capability()[0] >= 8 if device_available: - if spec := importlib.util.find_spec("flash_attn"): - flash_attn = importlib.util.module_from_spec(spec) - flash_attn_available = hasattr(flash_attn, - "flash_attn_varlen_func") - else: - flash_attn_available = False + from transformers.utils import is_flash_attn_2_available - if flash_attn_available: + if is_flash_attn_2_available(): self._use_flash_attn = True else: logger.warning( From 45873463aa63eab81b8dbfeec4748b269f46b610 Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Fri, 6 Sep 2024 15:50:01 +0800 Subject: [PATCH 25/34] reformat. --- vllm/model_executor/models/qwen2_vl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 265c0e4062141..c26bcd5d0353a 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,7 +22,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -import importlib.util from array import array from functools import lru_cache, partial from typing import (Iterable, List, Mapping, Optional, Tuple, Type, TypedDict, From ffad79fece08b43eccf521f6d9bcb830c0c07a3a Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Fri, 6 Sep 2024 19:10:40 +0800 Subject: [PATCH 26/34] Refactor MRotaryEmbedding. 
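
For reference, a minimal sketch of the per-section cos/sin selection that the
refactored forward() keeps, assuming head_size 128 (so rotary_dim // 2 == 64)
and mrope_section == [16, 24, 24] for the temporal / height / width position
streams (example values only, not read from any config):

    import torch

    mrope_section = [16, 24, 24]    # assumed example; sums to rotary_dim // 2
    num_tokens = 5
    rotary_half = sum(mrope_section)

    # With multimodal inputs, positions have shape (3, num_tokens), so the
    # cos table indexed by them has shape (3, num_tokens, rotary_half).
    cos = torch.randn(3, num_tokens, rotary_half)

    # Use the temporal row for the first 16 rotary dims, the height row for
    # the next 24 and the width row for the last 24, then reassemble.
    cos = torch.cat(
        [m[i] for i, m in enumerate(cos.split(mrope_section, dim=-1))],
        dim=-1)
    assert cos.shape == (num_tokens, rotary_half)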
--- docs/source/models/supported_models.rst | 10 +-- .../model_executor/layers/rotary_embedding.py | 66 ++++++++----------- 2 files changed, 32 insertions(+), 44 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index b96dccd90d0b7..5a23080548cd5 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -227,11 +227,6 @@ Multimodal Language Models - Image\ :sup:`E+` - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL - - Image / Video - - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image\ :sup:`E` @@ -252,6 +247,11 @@ Multimodal Language Models - Image - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL + - Image\ :sup:`+` / Video\ :sup:`+` + - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. + - * - :code:`UltravoxModel` - Ultravox - Audio\ :sup:`E+` diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index c61beff6868e2..7fa6c5e7fcde4 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -738,22 +738,23 @@ def forward( query: torch.Tensor, key: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: - """PyTorch-native implementation equivalent to forward().""" - - qk_ndim_in = query.ndim - - query = query.view(*query.shape[:-1], -1, self.head_size) - key = key.view(*key.shape[:-1], -1, self.head_size) - - query_rot = query[..., :self.rotary_dim] - key_rot = key[..., :self.rotary_dim] - if self.rotary_dim < self.head_size: - query_pass = query[..., self.rotary_dim:] - key_pass = key[..., self.rotary_dim:] - + """PyTorch-native implementation equivalent to forward(). + + Args: + positions: + [num_tokens,] (text only) or + [3, num_tokens] (T/H/W positions with multimodal inputs) + query: [num_tokens, num_heads * head_size] + key: [num_tokens, num_kv_heads * head_size] + """ + assert positions.ndim == 1 or positions.ndim == 2 + + num_tokens = positions.shape[-1] cos_sin = self.cos_sin_cache[positions] cos, sin = cos_sin.chunk(2, dim=-1) - if self.mrope_section and positions.ndim == query.ndim - 1: + if positions.ndim == 2: + assert self.mrope_section + cos = torch.cat([ m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1)) @@ -765,32 +766,19 @@ def forward( ], dim=-1) - if self.is_neox_style: - # NOTE(woosuk): Here we assume that the positions tensor has the - # shape [batch_size, seq_len]. 
- cos = cos.repeat(1, 1, 2).unsqueeze(-2) - sin = sin.repeat(1, 1, 2).unsqueeze(-2) - else: - cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) - sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) - - rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj - query_rot = query_rot * cos + rotate_fn(query_rot) * sin - key_rot = key_rot * cos + rotate_fn(key_rot) * sin - - if self.rotary_dim < self.head_size: - query = torch.cat((query_rot, query_pass), dim=-1) - key = torch.cat((key_rot, key_pass), dim=-1) - else: - query = query_rot - key = key_rot - - query = query.flatten(-2) - key = key.flatten(-2) - if query.ndim > qk_ndim_in: - query = query.squeeze(0) - key = key.squeeze(1) + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key @staticmethod From d52741754c291def54cdfefb49a4a1d19dcd3483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Mon, 9 Sep 2024 15:28:17 +0800 Subject: [PATCH 27/34] Add "video" into ModalityStr. --- vllm/entrypoints/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a1d6bc3dd831d..598ea1c0dd503 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -107,7 +107,7 @@ class ConversationMessage(TypedDict, total=False): """The tool calls generated by the model, such as function calls.""" -ModalityStr = Literal["image", "audio"] +ModalityStr = Literal["image", "audio", "video"] _T = TypeVar("_T") From 6f3116c9dad0537b2858af703938aa9bf6c25bcf Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Mon, 9 Sep 2024 19:50:56 +0800 Subject: [PATCH 28/34] Add Qwen2-VL examples. 
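
For reference, a minimal single-image run along the lines of the new offline
example (the prompt template mirrors the example code; the image URL is only
a placeholder):

    from vllm import LLM, SamplingParams
    from vllm.multimodal.utils import fetch_image

    llm = LLM(model="Qwen/Qwen2-VL-7B-Instruct", max_num_seqs=5)
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
              "What is in this image?<|im_end|>\n<|im_start|>assistant\n")
    image = fetch_image("https://example.com/demo.jpg")  # placeholder URL

    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": image}},
        sampling_params=SamplingParams(temperature=0.0, max_tokens=64))
    print(outputs[0].outputs[0].text)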
--- examples/offline_inference_vision_language.py | 18 +++++ ...e_inference_vision_language_multi_image.py | 68 +++++++++++++++++-- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index aa1580343aee7..14e398b795887 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -173,6 +173,23 @@ def run_qwen_vl(question): return llm, prompt, stop_token_ids +# Qwen2-VL +def run_qwen2_vl(question): + model_name = "Qwen/Qwen2-VL-7B-Instruct" + + llm = LLM( + model=model_name, + max_num_seqs=5, + ) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -184,6 +201,7 @@ def run_qwen_vl(question): "blip-2": run_blip2, "internvl_chat": run_internvl, "qwen_vl": run_qwen_vl, + "qwen2_vl": run_qwen2_vl, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index dd84627b9dc58..92d5c58ee6fd1 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -6,7 +6,7 @@ from argparse import Namespace from typing import List -from transformers import AutoTokenizer +from transformers import AutoProcessor, AutoTokenizer from vllm import LLM, SamplingParams from vllm.multimodal.utils import fetch_image @@ -30,7 +30,7 @@ def load_phi3v(question, image_urls: List[str]): for i, _ in enumerate(image_urls, start=1)) prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompt, stop_token_ids, None def load_internvl(question, image_urls: List[str]): @@ -60,18 +60,72 @@ def load_internvl(question, image_urls: List[str]): # https://huggingface.co/OpenGVLab/InternVL2-2B#service stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - return llm, prompt, stop_token_ids + + return llm, prompt, stop_token_ids, None + + +def load_qwen2_vl(question, image_urls: List[str]): + try: + from qwen_vl_utils import process_vision_info + except ModuleNotFoundError: + print('WARNING: `qwen-vl-utils` not installed, input images will not ' + 'be automatically resized. You can enable this functionality by ' + '`pip install qwen-vl-utils`.') + process_vision_info = None + + model_name = "Qwen/Qwen2-VL-7B-Instruct" + + llm = LLM( + model=model_name, + max_num_seqs=5, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": "system", + "content": "You are a helpful assistant." 
+ }, { + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + stop_token_ids = None + + if process_vision_info is None: + image_data = [fetch_image(url) for url in image_urls] + else: + image_data, _ = process_vision_info(messages) + + return llm, prompt, stop_token_ids, image_data model_example_map = { "phi3_v": load_phi3v, "internvl_chat": load_internvl, + "qwen2_vl": load_qwen2_vl, } def run_generate(model, question: str, image_urls: List[str]): - llm, prompt, stop_token_ids = model_example_map[model](question, - image_urls) + llm, prompt, stop_token_ids, image_data = model_example_map[model]( + question, image_urls) + if image_data is None: + image_data = [fetch_image(url) for url in image_urls] sampling_params = SamplingParams(temperature=0.0, max_tokens=128, @@ -81,7 +135,7 @@ def run_generate(model, question: str, image_urls: List[str]): { "prompt": prompt, "multi_modal_data": { - "image": [fetch_image(url) for url in image_urls] + "image": image_data }, }, sampling_params=sampling_params) @@ -92,7 +146,7 @@ def run_generate(model, question: str, image_urls: List[str]): def run_chat(model: str, question: str, image_urls: List[str]): - llm, _, stop_token_ids = model_example_map[model](question, image_urls) + llm, _, stop_token_ids, _ = model_example_map[model](question, image_urls) sampling_params = SamplingParams(temperature=0.0, max_tokens=128, From 386f3029fe25ab86890e1aa8cbaa50656ecca37c Mon Sep 17 00:00:00 2001 From: "suyang.fy" Date: Tue, 10 Sep 2024 23:05:31 +0800 Subject: [PATCH 29/34] Optimizer Qwen2-VL input processor. Update document. 
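
The number of vision tokens only depends on the (t, h, w) grid after
smart_resize, so it can be computed in the input processor without actually
resizing or rescaling the pixel data. A back-of-the-envelope sketch, assuming
patch_size=14 and merge_size=2 and an image that smart_resize maps to
1036x1372 (illustrative values only):

    patch_size, merge_size = 14, 2       # assumed Qwen2-VL defaults
    resized_h, resized_w = 1036, 1372    # example smart_resize output
    grid_t = 1                           # single image
    grid_h = resized_h // patch_size     # 74
    grid_w = resized_w // patch_size     # 98
    vision_tokens = grid_t * grid_h * grid_w           # 7252
    llm_tokens = vision_tokens // (merge_size ** 2)    # 1813 pad tokens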
--- vllm/model_executor/models/qwen2_vl.py | 198 +++++++++++++++++++++---- 1 file changed, 166 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index c26bcd5d0353a..3f8c590a39b00 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -33,9 +33,13 @@ from einops import rearrange, repeat from PIL import Image from transformers import Qwen2VLConfig +from transformers.image_utils import (get_image_size, + infer_channel_dimension_format, + to_numpy_array) from transformers.models.qwen2_vl.configuration_qwen2_vl import ( Qwen2VLVisionConfig) -from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize +from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( + make_batched_images, make_batched_videos, smart_resize) import vllm.envs as envs from vllm.attention import AttentionMetadata @@ -579,33 +583,62 @@ def mm_input_mapper_for_qwen2_vl( data_type_key="video") -def _get_max_image_info(image_processor, - data_type_key: str = "image", - mm_count: int = 1): - max_resized_height, max_resized_width = smart_resize( +def _get_vision_info( + image_processor, + height: int, + width: int, + min_pixels: int, + max_pixels: int, + do_resize: bool = True, + data_type_key: str = "image", + mm_count: int = 1, +): + """Get information (resized height / width and number of vision tokens) + of input image / video frame.""" + + if do_resize: + resized_height, resized_width = smart_resize( + height=height, + width=width, + factor=image_processor.patch_size * image_processor.merge_size, + min_pixels=min_pixels, + max_pixels=max_pixels, + ) + else: + resized_height, resized_width = height, width + + if data_type_key == "image": + grid_t = mm_count + else: + assert data_type_key == "video" + grid_t = max(mm_count // image_processor.temporal_patch_size, 1) + + grid_h = resized_height // image_processor.patch_size + grid_w = resized_width // image_processor.patch_size + vision_tokens = grid_t * grid_h * grid_w + llm_num_vision_tokens = (vision_tokens // image_processor.merge_size // + image_processor.merge_size) + + return resized_height, resized_width, llm_num_vision_tokens + + +def _get_max_image_info( + image_processor, + data_type_key: str = "image", + mm_count: int = 1, +): + return _get_vision_info( + image_processor, height=9999999, width=9999999, - factor=image_processor.patch_size * image_processor.merge_size, # Limit min / max pixels. 
min_pixels=max(image_processor.min_pixels, 28 * 28), max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28), + data_type_key=data_type_key, + mm_count=mm_count, ) - if data_type_key == "image": - max_grid_t = mm_count - else: - assert data_type_key == "video" - max_grid_t = max(mm_count // image_processor.temporal_patch_size, 1) - - max_grid_h = max_resized_height // image_processor.patch_size - max_grid_w = max_resized_width // image_processor.patch_size - max_image_tokens = max_grid_t * max_grid_h * max_grid_w - max_llm_image_tokens = (max_image_tokens // image_processor.merge_size // - image_processor.merge_size) - - return max_resized_height, max_resized_width, max_llm_image_tokens - def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: image_processor = cached_get_image_processor(ctx.model_config.model) @@ -665,6 +698,32 @@ def dummy_data_for_qwen2_vl( } +def _get_llm_num_vision_tokens( + mm_inputs: list, + data_type_key: str, + image_processor, +): + """Get number of vision tokens of multimodal inputs. + + This method is derived from `transformers.models.qwen2_vl. + image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`. + """ + image = to_numpy_array(mm_inputs[0]) + input_data_format = infer_channel_dimension_format(image) + height, width = get_image_size(image, channel_dim=input_data_format) + _, _, llm_num_vision_tokens = _get_vision_info( + image_processor, + height=height, + width=width, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + do_resize=image_processor.do_resize, + data_type_key=data_type_key, + mm_count=len(mm_inputs), + ) + return llm_num_vision_tokens + + def input_processor_for_qwen2_vl(ctx: InputContext, llm_inputs: LLMInputs) -> LLMInputs: multi_modal_data = llm_inputs.get("multi_modal_data", None) @@ -675,20 +734,91 @@ def input_processor_for_qwen2_vl(ctx: InputContext, video_inputs = multi_modal_data.get("video", None) processor = cached_get_processor(ctx.model_config.model) + image_processor = processor.image_processor + hf_config = ctx.get_hf_config(Qwen2VLConfig) - prompt = llm_inputs["prompt"] - if prompt is None: - prompt_token_ids = llm_inputs["prompt_token_ids"] - prompt = processor.tokenizer.decode(prompt_token_ids) - - inputs = processor(text=[prompt], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt") + # To avoid redundant processing of vision objects (resize, rescale, etc.), + # we extract code of calculating number of vision tokens from + # `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`. + # + # The following code is equivalent to: + # prompt = llm_inputs["prompt"] + # inputs = processor(text=[prompt], + # images=image_inputs, + # videos=video_inputs, + # padding=True, + # return_tensors="pt") + # prompt_token_ids = inputs["input_ids"][0].tolist() + + prompt_token_ids = llm_inputs.get("prompt_token_ids", None) + if prompt_token_ids is None: + prompt = llm_inputs["prompt"] + prompt_token_ids = processor.tokenizer( + prompt, + padding=True, + return_tensors=None, + )["input_ids"] + + # Expand image pad tokens. 
+ if image_inputs is not None: + image_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.image_token_id + ] + image_inputs = make_batched_images(image_inputs) + assert len(image_indices) == len(image_inputs) + + prompt_token_ids_with_image = [] + for image_cnt, image in enumerate(image_inputs): + num_image_tokens = _get_llm_num_vision_tokens( + [image], + data_type_key="image", + image_processor=image_processor, + ) + if image_cnt == 0: + non_image_tokens = prompt_token_ids[:image_indices[image_cnt]] + else: + non_image_tokens = prompt_token_ids[image_indices[image_cnt - + 1] + + 1:image_indices[image_cnt]] + prompt_token_ids_with_image.extend(non_image_tokens) + prompt_token_ids_with_image.extend( + hf_config.image_token_id for _ in range(num_image_tokens)) + prompt_token_ids_with_image.extend(prompt_token_ids[image_indices[-1] + + 1:]) + prompt_token_ids = prompt_token_ids_with_image + + # Expand video pad tokens. + if video_inputs is not None: + video_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.video_token_id + ] + video_inputs = make_batched_videos(video_inputs) + assert len(video_indices) == len(video_inputs) + + prompt_token_ids_with_video = [] + for video_cnt, video in enumerate(video_inputs): + num_video_tokens = _get_llm_num_vision_tokens( + video, + data_type_key="video", + image_processor=image_processor, + ) + if video_cnt == 0: + non_video_tokens = prompt_token_ids[:video_indices[video_cnt]] + else: + non_video_tokens = prompt_token_ids[video_indices[video_cnt - + 1] + + 1:video_indices[video_cnt]] + prompt_token_ids_with_video.extend(non_video_tokens) + prompt_token_ids_with_video.extend( + hf_config.video_token_id for _ in range(num_video_tokens)) + prompt_token_ids_with_video.extend(prompt_token_ids[video_indices[-1] + + 1:]) + prompt_token_ids = prompt_token_ids_with_video return LLMInputs( - prompt_token_ids=inputs["input_ids"][0].tolist(), + prompt_token_ids=prompt_token_ids, prompt=llm_inputs["prompt"], multi_modal_data=multi_modal_data, ) @@ -712,6 +842,9 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None) -> None: super().__init__() + assert not cache_config.enable_prefix_caching, \ + "Qwen2-VL currently does not support prefix caching" + self.config = config self.multimodal_config = multimodal_config @@ -719,7 +852,8 @@ def __init__(self, config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), - # NOTE: Qwen2-VL does not support any quantization method now. + # NOTE: Qwen2-VL vision encoder does not support any + # quantization method now. quant_config=None, ) From c64c21720df6850de3cb4bf9d39236357d75ef6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 10 Sep 2024 23:34:23 +0800 Subject: [PATCH 30/34] Update model notes and requirements-common.txt. --- docs/source/models/supported_models.rst | 6 +++++- requirements-common.txt | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 498e63ab87437..917fe8d236b41 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -248,7 +248,7 @@ Multimodal Language Models - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. 
- * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL + - Qwen2-VL (see note) - Image\ :sup:`+` / Video\ :sup:`+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - @@ -265,6 +265,10 @@ Multimodal Language Models For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 +.. note:: + For Qwen2-VL, the latest version of :code:`transformers` official repo have a bug when loading model config, so we need to use a specific old version :code:`21fac7abba2a37fae86106f87fcf9974fd1e3830` for now. + For more details, please see: https://github.com/vllm-project/vllm/pull/7905#issuecomment-2339863055 + ---- If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. diff --git a/requirements-common.txt b/requirements-common.txt index 49a290317f818..4e008112c6cb0 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -28,3 +28,4 @@ importlib_metadata mistral_common >= 1.3.4 pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 +einops # Required for Qwen2-VL. From 6bdefd6e6c69544e9f9774fad0192587640f1ce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 10 Sep 2024 23:39:58 +0800 Subject: [PATCH 31/34] Update model notes. --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 917fe8d236b41..1fd1cb59d0a56 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -266,7 +266,7 @@ Multimodal Language Models For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 .. note:: - For Qwen2-VL, the latest version of :code:`transformers` official repo have a bug when loading model config, so we need to use a specific old version :code:`21fac7abba2a37fae86106f87fcf9974fd1e3830` for now. + For :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now. 
For more details, please see: https://github.com/vllm-project/vllm/pull/7905#issuecomment-2339863055 ---- From 33dd048d42ecc07f360ddf03be2e9083a560bf84 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 11 Sep 2024 02:47:42 +0000 Subject: [PATCH 32/34] Skip loading model --- tests/models/test_registry.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index b058e2755c245..b283d2deb1fbe 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,9 +1,14 @@ import pytest +import transformers from vllm.model_executor.models import _MODELS, ModelRegistry @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): + if (model_cls == "Qwen2VLForConditionalGeneration" + and transformers.__version__ < "4.45"): + pytest.skip("Waiting for next transformers release") + # Ensure all model classes can be imported successfully ModelRegistry.resolve_model_cls([model_cls]) From 282c66ae4e6808981374a58b839a61b7297fb13e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 11 Sep 2024 03:02:19 +0000 Subject: [PATCH 33/34] format --- tests/models/test_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index b283d2deb1fbe..3930a5f465f70 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -7,7 +7,7 @@ @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): if (model_cls == "Qwen2VLForConditionalGeneration" - and transformers.__version__ < "4.45"): + and transformers.__version__ < "4.45"): pytest.skip("Waiting for next transformers release") # Ensure all model classes can be imported successfully From 14ef94d02f7dc5b3013869edea884bd27c2f0829 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 11 Sep 2024 03:46:07 +0000 Subject: [PATCH 34/34] Increase `max_model_len` to fit the original image --- examples/offline_inference_vision_language_multi_image.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 92d5c58ee6fd1..ed7e886d57806 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -78,7 +78,7 @@ def load_qwen2_vl(question, image_urls: List[str]): llm = LLM( model=model_name, max_num_seqs=5, - max_model_len=4096, + max_model_len=32768 if process_vision_info is None else 4096, limit_mm_per_prompt={"image": len(image_urls)}, )