From 4bd5aa9d39caec988b3e3efd0fdb4c4a74aca396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 25 Jun 2024 15:11:47 +0800 Subject: [PATCH 01/47] [Model] Initialize deepseek-vl-7b-chat support --- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/deepseek_vl.py | 1901 +++++++++++++++++ vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 2 + .../transformers_utils/configs/deepseek_vl.py | 89 + 5 files changed, 1996 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/deepseek_vl.py create mode 100644 vllm/transformers_utils/configs/deepseek_vl.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 5afb2e1d44d39..9d1da1ebc2643 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -21,6 +21,7 @@ "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), + "MultiModalityCausalLM": ("deepseek_vl", "DeepSeekMultiModalityCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py new file mode 100644 index 0000000000000..88178d9d773a2 --- /dev/null +++ b/vllm/model_executor/models/deepseek_vl.py @@ -0,0 +1,1901 @@ +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
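+#
+# Inference-only DeepSeek-VL model definition for vLLM. The file bundles:
+#   * VLMImageProcessor - resizes/pads images to a square canvas, then
+#     rescales and normalizes them into pixel_values,
+#   * SigLIP- and SAM-style vision towers plus a HybridVisionTower that
+#     combines a high-resolution and a low-resolution encoder,
+#   * MlpProjector, used as the "aligner" that maps vision features into
+#     the language model's embedding space,
+#   * DeepSeekMultiModalityCausalLM, which splices the aligned image
+#     embeddings into a LlamaModel's input embeddings before decoding.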
+ +import math +import warnings +import copy + +from dataclasses import dataclass +from functools import partial +from dataclasses import dataclass +from functools import partial +from typing import ( + Callable, + Dict, + Final, + List, + Literal, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, +) + +import torch +import torch.nn as nn +import torchvision.transforms +import torch.nn.functional as F +import numpy as np +import torchvision +import torchvision.transforms.functional +import torch.nn.functional as F + + +from einops import rearrange +from transformers import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig +from einops import rearrange +from PIL import Image +from transformers import AutoImageProcessor, PretrainedConfig +from transformers.image_processing_utils import BaseImageProcessor, BatchFeature +from transformers.image_utils import to_numpy_array +from einops import rearrange +from transformers import PreTrainedModel +from timm.layers import ( + AttentionPoolLatent, + DropPath, + LayerType, + Mlp, + PatchDropout, + PatchEmbed, + resample_abs_pos_embed, +) +from timm.models._manipulate import checkpoint_seq, named_apply + + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import get_dummy_image_data +from vllm.sequence import SamplerOutput +from .vlm_base import VisionLanguageModelBase +from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig + + +ImageType = Union[np.ndarray, torch.Tensor, Image.Image] +IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) +IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) +IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) +IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +class VLMImageProcessorConfig(PretrainedConfig): + model_type = "deepseek_vlm" + image_size: int + min_size: int + image_mean: Union[Tuple[float, float, float], List[float]] + image_std: Union[Tuple[float, float, float], List[float]] + rescale_factor: float + do_normalize: bool + + def __init__( + self, + image_size: int, + min_size: int = 14, + image_mean: Union[Tuple[float, float, float], List[float]] = ( + 0.48145466, + 0.4578275, + 0.40821073, + ), + image_std: Union[Tuple[float, float, float], List[float]] = ( + 0.26862954, + 0.26130258, + 0.27577711, + ), + rescale_factor: float = 1.0 / 255.0, + do_normalize: bool = True, + **kwargs, + ): + self.image_size = image_size + self.min_size = min_size + self.image_mean = image_mean + self.image_std = image_std + self.rescale_factor = 
rescale_factor + self.do_normalize = do_normalize + + super().__init__(**kwargs) + + +class VLMImageProcessor(BaseImageProcessor): + model_input_names = ["pixel_values"] + + def __init__( + self, + image_size: int, + min_size: int = 14, + image_mean: Union[Tuple[float, float, float], List[float]] = ( + 0.48145466, + 0.4578275, + 0.40821073, + ), + image_std: Union[Tuple[float, float, float], List[float]] = ( + 0.26862954, + 0.26130258, + 0.27577711, + ), + rescale_factor: float = 1.0 / 255.0, + do_normalize: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.rescale_factor = rescale_factor + self.image_mean = image_mean + self.image_std = image_std + self.min_size = min_size + self.do_normalize = do_normalize + + if image_mean is None: + self.background_color = (127, 127, 127) + else: + self.background_color = tuple([int(x * 255) for x in image_mean]) + + def resize(self, pil_img: Image) -> np.ndarray: + """ + + Args: + pil_img (PIL.Image): [H, W, 3] in PIL.Image in RGB + + Returns: + x (np.ndarray): [3, self.image_size, self.image_size] + """ + + width, height = pil_img.size + max_size = max(width, height) + + size = [ + max(int(height / max_size * self.image_size), self.min_size), + max(int(width / max_size * self.image_size), self.min_size), + ] + + if width <= 0 or height <= 0 or size[0] <= 0 or size[1] <= 0: + print(f"orig size = {pil_img.size}, new size = {size}") + raise ValueError("Invalid size!") + + pil_img = torchvision.transforms.functional.resize( + pil_img, + size, + interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC, + antialias=True, + ) + + pil_img = expand2square(pil_img, self.background_color) + x = to_numpy_array(pil_img) + + # [H, W, 3] -> [3, H, W] + x = np.transpose(x, (2, 0, 1)) + + return x + + def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature: + # resize and pad to [self.image_size, self.image_size] + # then convert from [H, W, 3] to [3, H, W] + # print(images) + if not isinstance(images, List): + images = [ + images, + ] + images: List[np.ndarray] = [self.resize(image) for image in images] + + # resacle from [0, 255] -> [0, 1] + images = [ + self.rescale( + image=image, + scale=self.rescale_factor, + input_data_format="channels_first", + ) + for image in images + ] + + # normalize + if self.do_normalize: + images = [ + self.normalize( + image=image, + mean=self.image_mean, + std=self.image_std, + input_data_format="channels_first", + ) + for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + @property + def default_shape(self): + return [3, self.image_size, self.image_size] + + +class MlpProjector(nn.Module): + def __init__(self, cfg): + super().__init__() + + self.cfg = cfg + + if cfg.projector_type == "identity": + modules = nn.Identity() + + elif cfg.projector_type == "linear": + modules = nn.Linear(cfg.input_dim, cfg.n_embed) + + elif cfg.projector_type == "mlp_gelu": + mlp_depth = cfg.get("depth", 1) + modules = [nn.Linear(cfg.input_dim, cfg.n_embed)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(cfg.n_embed, cfg.n_embed)) + modules = nn.Sequential(*modules) + + elif cfg.projector_type == "low_high_hybrid_split_mlp_gelu": + mlp_depth = cfg.get("depth", 1) + self.high_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2) + self.low_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2) + + modules = [] + for _ in range(1, mlp_depth): + 
modules.append(nn.GELU()) + modules.append(nn.Linear(cfg.n_embed, cfg.n_embed)) + modules = nn.Sequential(*modules) + + else: + raise ValueError(f"Unknown projector type: {cfg.projector_type}") + + self.layers = modules + + def forward( + self, x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor] + ): + """ + + Args: + x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, + then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); + otherwise it is the feature from the single vision encoder. + + Returns: + x (torch.Tensor): [b, s, c] + """ + + if isinstance(x_or_tuple, tuple): + # self.cfg.projector_type == "low_high_hybrid_split_mlp_gelu": + high_x, low_x = x_or_tuple + high_x = self.high_up_proj(high_x) + low_x = self.low_up_proj(low_x) + x = torch.concat([high_x, low_x], dim=-1) + else: + x = x_or_tuple + + return self.layers(x) + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) # noqa: E741 + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # type: (torch.Tensor, float, float, float, float) -> torch.Tensor + r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first + convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its orignal dtype. + Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn + from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + + with torch.no_grad(): + dtype = tensor.dtype + tensor_fp32 = tensor.float() + tensor_fp32 = _no_grad_trunc_normal_(tensor_fp32, mean, std, a, b) + tensor_dtype = tensor_fp32.to(dtype=dtype) + tensor.copy_(tensor_dtype) + + +def init_weights(self): + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5) + trunc_normal_(self.latent, std=self.latent_dim**-0.5) + + +def init_weights_vit_timm(module: nn.Module, name: str = "") -> None: + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif hasattr(module, "init_weights"): + module.init_weights() + + +class SigLipAttention(nn.Module): + fused_attn: Final[bool] + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + # self.fused_attn = use_fused_attn() + self.fused_attn = True + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0.0 else nn.Identity() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + + if self.fused_attn: + x = F.scaled_dot_product_attention( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + ) + else: + q = q * self.scale + attn = q @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + x = attn @ v + + x = x.transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: float = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class SigLipBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + qk_norm: bool = False, + proj_drop: float = 0.0, + attn_drop: float = 0.0, + init_values: Optional[float] = None, + drop_path: float = 0.0, + act_layer: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + mlp_layer: nn.Module = Mlp, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = SigLipAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + attn_drop=attn_drop, + proj_drop=proj_drop, + norm_layer=norm_layer, + ) + 
self.ls1 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = mlp_layer( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer, + drop=proj_drop, + ) + self.ls2 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class VisionTransformer(nn.Module): + """Vision Transformer + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + """ + + dynamic_img_size: Final[bool] + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + num_classes: int = 1000, + global_pool: Literal["", "avg", "token", "map"] = "token", + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + qk_norm: bool = False, + init_values: Optional[float] = None, + class_token: bool = True, + no_embed_class: bool = False, + reg_tokens: int = 0, + pre_norm: bool = False, + fc_norm: Optional[bool] = None, + dynamic_img_size: bool = False, + dynamic_img_pad: bool = False, + drop_rate: float = 0.0, + pos_drop_rate: float = 0.0, + patch_drop_rate: float = 0.0, + proj_drop_rate: float = 0.0, + attn_drop_rate: float = 0.0, + drop_path_rate: float = 0.0, + weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "", + embed_layer: Callable = PatchEmbed, + norm_layer: Optional[LayerType] = None, + act_layer: Optional[LayerType] = None, + block_fn: Type[nn.Module] = SigLipBlock, + mlp_layer: Type[nn.Module] = Mlp, + ignore_head: bool = False, + ) -> None: + """ + Args: + img_size: Input image size. + patch_size: Patch size. + in_chans: Number of image input channels. + num_classes: Mumber of classes for classification head. + global_pool: Type of global pooling for final sequence (default: 'token'). + embed_dim: Transformer embedding dimension. + depth: Depth of transformer. + num_heads: Number of attention heads. + mlp_ratio: Ratio of mlp hidden dim to embedding dim. + qkv_bias: Enable bias for qkv projections if True. + init_values: Layer-scale init values (layer-scale enabled if not None). + class_token: Use class token. + no_embed_class: Don't include position embeddings for class (or reg) tokens. + reg_tokens: Number of register tokens. + fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'. + drop_rate: Head dropout rate. + pos_drop_rate: Position embedding dropout rate. + attn_drop_rate: Attention dropout rate. + drop_path_rate: Stochastic depth rate. + weight_init: Weight initialization scheme. + embed_layer: Patch embedding layer. + norm_layer: Normalization layer. + act_layer: MLP activation layer. + block_fn: Transformer block layer. 
+ """ + super().__init__() + assert global_pool in ("", "avg", "token", "map") + assert class_token or global_pool != "token" + use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm + # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6) + # act_layer = get_act_layer(act_layer) or nn.GELU + norm_layer = partial(nn.LayerNorm, eps=1e-6) + act_layer = nn.GELU + + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = ( + embed_dim # num_features for consistency with other models + ) + self.num_prefix_tokens = 1 if class_token else 0 + self.num_prefix_tokens += reg_tokens + self.num_reg_tokens = reg_tokens + self.has_class_token = class_token + self.no_embed_class = ( + no_embed_class # don't embed prefix positions (includes reg) + ) + self.dynamic_img_size = dynamic_img_size + self.grad_checkpointing = False + self.ignore_head = ignore_head + + embed_args = {} + if dynamic_img_size: + # flatten deferred until after pos embed + embed_args.update(dict(strict_img_size=False, output_fmt="NHWC")) + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + bias=not pre_norm, # disable bias if pre-norm is used (e.g. CLIP) + dynamic_img_pad=dynamic_img_pad, + **embed_args, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = ( + nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None + ) + self.reg_token = ( + nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None + ) + embed_len = ( + num_patches if no_embed_class else num_patches + self.num_prefix_tokens + ) + self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02) + self.pos_drop = nn.Dropout(p=pos_drop_rate) + if patch_drop_rate > 0: + self.patch_drop = PatchDropout( + patch_drop_rate, + num_prefix_tokens=self.num_prefix_tokens, + ) + else: + self.patch_drop = nn.Identity() + self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity() + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential( + *[ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + init_values=init_values, + proj_drop=proj_drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + mlp_layer=mlp_layer, + ) + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() + + # Classifier Head + if global_pool == "map": + AttentionPoolLatent.init_weights = init_weights + self.attn_pool = AttentionPoolLatent( + self.embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + ) + else: + self.attn_pool = None + self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() + self.head_drop = nn.Dropout(drop_rate) + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + if weight_init != "skip": + self.init_weights(weight_init) + + def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None: + assert mode in ("jax", "jax_nlhb", "moco", "") + # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0 + trunc_normal_(self.pos_embed, std=0.02) + if self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + @torch.jit.ignore + def no_weight_decay(self) -> Set: + return 
{"pos_embed", "cls_token", "dist_token"} + + @torch.jit.ignore + def group_matcher(self, coarse: bool = False) -> Dict: + return dict( + stem=r"^cls_token|pos_embed|patch_embed", # stem and embed + blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))], + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable: bool = True) -> None: + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self) -> nn.Module: + return self.head + + def reset_classifier(self, num_classes: int, global_pool=None) -> None: + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ("", "avg", "token", "map") + if global_pool == "map" and self.attn_pool is None: + assert ( + False + ), "Cannot currently add attention pooling in reset_classifier()." + elif global_pool != "map " and self.attn_pool is not None: + self.attn_pool = None # remove attention pooling + self.global_pool = global_pool + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: + if self.dynamic_img_size: + B, H, W, C = x.shape + pos_embed = resample_abs_pos_embed( + self.pos_embed, + (H, W), + num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens, + ) + x = x.view(B, -1, C) + else: + pos_embed = self.pos_embed + + to_cat = [] + if self.cls_token is not None: + to_cat.append(self.cls_token.expand(x.shape[0], -1, -1)) + if self.reg_token is not None: + to_cat.append(self.reg_token.expand(x.shape[0], -1, -1)) + + if self.no_embed_class: + # deit-3, updated JAX (big vision) + # position embedding does not overlap with class token, add then concat + x = x + pos_embed + if to_cat: + x = torch.cat(to_cat + [x], dim=1) + else: + # original timm, JAX, and deit vit impl + # pos_embed has entry for class token, concat then add + if to_cat: + x = torch.cat(to_cat + [x], dim=1) + x = x + pos_embed + + return self.pos_drop(x) + + def _intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, + ) -> List[torch.Tensor]: + outputs, num_blocks = [], len(self.blocks) + take_indices = set( + range(num_blocks - n, num_blocks) if isinstance(n, int) else n + ) + + # forward pass + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.patch_drop(x) + x = self.norm_pre(x) + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in take_indices: + outputs.append(x) + + return outputs + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, + reshape: bool = False, + return_prefix_tokens: bool = False, + norm: bool = False, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + """Intermediate layer accessor (NOTE: This is a WIP experiment). 
+ Inspired by DINO / DINOv2 interface + """ + # take last n blocks if n is an int, if in is a sequence, select by matching indices + outputs = self._intermediate_layers(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + prefix_tokens = [out[:, 0 : self.num_prefix_tokens] for out in outputs] + outputs = [out[:, self.num_prefix_tokens :] for out in outputs] + + if reshape: + grid_size = self.patch_embed.grid_size + outputs = [ + out.reshape(x.shape[0], grid_size[0], grid_size[1], -1) + .permute(0, 3, 1, 2) + .contiguous() + for out in outputs + ] + + if return_prefix_tokens: + return tuple(zip(outputs, prefix_tokens)) + return tuple(outputs) + + def forward_features(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.patch_drop(x) + x = self.norm_pre(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.norm(x) + return x + + def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor: + if self.attn_pool is not None: + x = self.attn_pool(x) + elif self.global_pool == "avg": + x = x[:, self.num_prefix_tokens :].mean(dim=1) + elif self.global_pool: + x = x[:, 0] # class token + x = self.fc_norm(x) + x = self.head_drop(x) + return x if pre_logits else self.head(x) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.forward_features(x) + if not self.ignore_head: + x = self.forward_head(x) + return x + + +@dataclass +class SigLIPVisionCfg: + width: int = 1152 + layers: Union[Tuple[int, int, int, int], int] = 27 + heads: int = 16 + patch_size: int = 14 + image_size: Union[Tuple[int, int], int] = 336 + global_pool: str = "map" + mlp_ratio: float = 3.7362 + class_token: bool = False + num_classes: int = 0 + use_checkpoint: bool = False + + +SigLIP_MODEL_CONFIG = { + "siglip_so400m_patch14_384": { + "image_size": 336, + "patch_size": 14, + "width": 1152, + "layers": 27, + "heads": 16, + "mlp_ratio": 3.7362, + "global_pool": "map", + "use_checkpoint": False, + }, + "siglip_so400m_patch14_224": { + "image_size": 224, + "patch_size": 14, + "width": 1152, + "layers": 27, + "heads": 16, + "mlp_ratio": 3.7362, + "global_pool": "map", + "use_checkpoint": False, + }, + "siglip_large_patch16_384": { + "image_size": 384, + "patch_size": 16, + "width": 1024, + "layers": 24, + "heads": 16, + "mlp_ratio": 4, + "global_pool": "map", + "use_checkpoint": False, + }, +} + + +def create_siglip_vit( + model_name: str = "siglip_so400m_patch14_384", + image_size: int = 384, + select_layer: int = -1, + ckpt_path: str = "", + **kwargs, +): + assert ( + model_name in SigLIP_MODEL_CONFIG.keys() + ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}" + + vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name]) + + if select_layer <= 0: + layers = min(vision_cfg.layers, vision_cfg.layers + select_layer + 1) + else: + layers = min(vision_cfg.layers, select_layer) + + model = VisionTransformer( + img_size=image_size, + patch_size=vision_cfg.patch_size, + embed_dim=vision_cfg.width, + depth=layers, + num_heads=vision_cfg.heads, + mlp_ratio=vision_cfg.mlp_ratio, + class_token=vision_cfg.class_token, + global_pool=vision_cfg.global_pool, + ignore_head=kwargs.get("ignore_head", True), + weight_init=kwargs.get("weight_init", "skip"), + num_classes=0, + ) + + if ckpt_path: + state_dict = torch.load(ckpt_path, map_location="cpu") + + incompatible_keys = model.load_state_dict(state_dict, strict=False) + print( + f"SigLIP-ViT restores 
from {ckpt_path},\n" + f"\tincompatible_keys:', {incompatible_keys}." + ) + + return model + + +class MLPBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + mlp_dim: int, + act: Type[nn.Module] = nn.GELU, + ) -> None: + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(x))) + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa +class ImageEncoderViT(nn.Module): + def __init__( + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + downsample_channels: Tuple[int, ...] = (512, 1024), + ) -> None: + """ + Args: + img_size (int): Input image size. + patch_size (int): Patch size. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of ViT. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + global_attn_indexes (list): Indexes for blocks using global attention. + downsample_channels (list): Channels for downsampling layers. + """ + super().__init__() + self.img_size = img_size + + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) + + self.pos_embed: Optional[nn.Parameter] = None + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. 
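+            # The embedding is kept as a (1, H // patch, W // patch, C) grid
+            # so it can be added directly to the NHWC patch tokens produced
+            # by PatchEmbed below.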
+ self.pos_embed = nn.Parameter( + torch.zeros( + 1, img_size // patch_size, img_size // patch_size, embed_dim + ) + ) + + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size if i not in global_attn_indexes else 0, + input_size=(img_size // patch_size, img_size // patch_size), + ) + self.blocks.append(block) + + self.neck = nn.Sequential( + nn.Conv2d( + embed_dim, + out_chans, + kernel_size=1, + bias=False, + ), + LayerNorm2d(out_chans), + nn.Conv2d( + out_chans, + out_chans, + kernel_size=3, + padding=1, + bias=False, + ), + LayerNorm2d(out_chans), + ) + + in_channels = out_chans + downsamples = [] + for i in range(len(downsample_channels)): + out_channels = downsample_channels[i] + downsamples.append( + nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False, + ) + ) + in_channels = out_channels + self.downsamples = nn.Sequential(*downsamples) + + self.sam_hd = True + if self.sam_hd: + self.hd_alpha_downsamples = nn.Parameter(torch.zeros(1)) + # self.neck_hd = nn.Linear(embed_dim, embed_dim) + self.neck_hd = copy.deepcopy(self.neck) + # self.downsamples_hd = copy.deepcopy(self.downsamples) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + self.pos_embed + + global_features = [] + for i, blk in enumerate(self.blocks): + x = blk(x) + if self.sam_hd and blk.window_size == 0: + global_features.append(x) + + x = self.neck(x.permute(0, 3, 1, 2)) + x_dtype = x.dtype + x = F.interpolate( + x.float(), size=(96, 96), mode="bilinear", align_corners=False + ).to(x_dtype) + x = self.downsamples(x) + + if self.sam_hd: + first_global_feature = self.neck_hd(global_features[0].permute(0, 3, 1, 2)) + x_dtype = first_global_feature.dtype + first_global_feature = F.interpolate( + first_global_feature.float(), + size=(96, 96), + mode="bilinear", + align_corners=False, + ) + first_global_feature = self.downsamples(first_global_feature.to(x_dtype)) + x = x + first_global_feature * self.hd_alpha_downsamples + + return x + + +class Block(nn.Module): + """Transformer blocks with support of window attention and residual propagation blocks""" + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. If it equals 0, then + use global attention. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. 
+ """ + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else (window_size, window_size), + ) + + self.norm2 = norm_layer(dim) + self.mlp = MLPBlock( + embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer + ) + + self.window_size = window_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x + x = self.norm1(x) + # Window partition + if self.window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, self.window_size) + + x = self.attn(x) + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, self.window_size, pad_hw, (H, W)) + + x = shortcut + x + x = x + self.mlp(self.norm2(x)) + + return x + + +class Attention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. + """ + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + assert ( + input_size is not None + ), "Input size must be provided if using relative positional encoding." + # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, H, W, _ = x.shape + # qkv with shape (3, B, nHead, H * W, C) + qkv = ( + self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + ) + # q, k, v with shape (B * nHead, H * W, C) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) + + def do_attention(q, k, v): + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.use_rel_pos: + attn = add_decomposed_rel_pos( + attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W) + ) + + attn = attn.softmax(dim=-1) + x = ( + (attn @ v) + .view(B, self.num_heads, H, W, -1) + .permute(0, 2, 3, 1, 4) + .reshape(B, H, W, -1) + ) + + return x + + # from haiscale.utils import on_demand_checkpoint + # x = on_demand_checkpoint(do_attention, q, k, v) + x = do_attention(q, k, v) + x = self.proj(x) + + return x + + +def window_partition( + x: torch.Tensor, window_size: int +) -> Tuple[torch.Tensor, Tuple[int, int]]: + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. 
+ (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) + return windows, (Hp, Wp) + + +def window_unpartition( + windows: torch.Tensor, + window_size: int, + pad_hw: Tuple[int, int], + hw: Tuple[int, int], +) -> torch.Tensor: + """ + Window unpartition into original sequences and removing padding. + Args: + windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view( + B, Hp // window_size, Wp // window_size, window_size, window_size, -1 + ) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos( + attn: torch.Tensor, + q: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +) -> torch.Tensor: + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. 
+ """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = ( + attn.view(B, q_h, q_w, k_h, k_w) + + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :] + ).view(B, q_h * q_w, k_h * k_w) + + return attn + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. + """ + + def __init__( + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x + + +@dataclass +class SAMViTCfg: + image_size: Union[Tuple[int, int], int] = 1024 + width: int = 1024 + layers: int = 23 + heads: int = 16 + patch_size: int = 16 + window_size: int = 14 + prompt_embed_dim: int = 256 + global_attn_indexes: Union[List[int], Tuple[int]] = (5, 11, 17, 23) + downsample_channels: Union[List[int], Tuple[int]] = (512, 1024) + + +SAM_MODEL_CONFIG = { + "sam_vit_b": { + "width": 768, + "layers": 12, + "heads": 12, + "global_attn_indexes": [2, 5, 8, 11], + "downsample_channels": (), + }, + "sam_b_downsample": { + "width": 768, + "layers": 12, + "heads": 12, + "global_attn_indexes": [2, 5, 8, 11], + "downsample_channels": (512, 1024), + }, + "sam_vit_l": { + "width": 1024, + "layers": 24, + "heads": 16, + "global_attn_indexes": [5, 11, 17, 23], + "downsample_channels": (), + }, + "sam_vit_h": { + "width": 1280, + "layers": 32, + "heads": 16, + "global_attn_indexes": [7, 15, 23, 31], + "downsample_channels": (), + }, +} + + +def create_sam_vit( + model_name: str = "sam_b_downsample", + image_size: int = 1024, + ckpt_path: str = "", + **kwargs, +): + assert ( + model_name in SAM_MODEL_CONFIG.keys() + ), f"model name: {model_name} should be in {SAM_MODEL_CONFIG.keys()}" + + sam_cfg = SAMViTCfg(**SAM_MODEL_CONFIG[model_name]) + image_encoder = ImageEncoderViT( + depth=sam_cfg.layers, + embed_dim=sam_cfg.width, + img_size=image_size, + mlp_ratio=4, + norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), + num_heads=sam_cfg.heads, + patch_size=sam_cfg.patch_size, + qkv_bias=True, + use_rel_pos=True, + global_attn_indexes=sam_cfg.global_attn_indexes, + window_size=14, + out_chans=sam_cfg.prompt_embed_dim, + downsample_channels=sam_cfg.downsample_channels, + ) + + if ckpt_path: + state_dict = torch.load(ckpt_path) + image_encoder.load_state_dict(state_dict, strict=False) + print(f"SAM-ViT restores from {ckpt_path}") + + return image_encoder + + +class CLIPVisionTower(nn.Module): + def __init__( + self, + model_name: str = "siglip_large_patch16_384", + image_size: Union[Tuple[int, int], int] = 336, + select_feature: str = "patch", + select_layer: int = -2, + select_layers: list = None, + ckpt_path: str = "", + pixel_mean: Optional[List[float]] = None, + pixel_std: Optional[List[float]] = None, + **kwargs, + ): + super().__init__() + + 
self.model_name = model_name + self.select_feature = select_feature + self.select_layer = select_layer + self.select_layers = select_layers + + vision_tower_params = { + "model_name": model_name, + "image_size": image_size, + "ckpt_path": ckpt_path, + "select_layer": select_layer, + } + vision_tower_params.update(kwargs) + self.vision_tower, self.forward_kwargs = self.build_vision_tower( + vision_tower_params + ) + + if pixel_mean is not None and pixel_std is not None: + image_norm = torchvision.transforms.Normalize( + mean=pixel_mean, std=pixel_std + ) + else: + image_norm = None + + self.image_norm = image_norm + + def build_vision_tower(self, vision_tower_params): + if self.model_name.startswith("siglip"): + self.select_feature = "same" + vision_tower = create_siglip_vit(**vision_tower_params) + forward_kwargs = dict() + + elif self.model_name.startswith("sam"): + vision_tower = create_sam_vit(**vision_tower_params) + forward_kwargs = dict() + + else: # huggingface + from transformers import CLIPVisionModel + + vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params) + forward_kwargs = dict(output_hidden_states=True) + + return vision_tower, forward_kwargs + + def feature_select(self, image_forward_outs): + if isinstance(image_forward_outs, torch.Tensor): + # the output has been the self.select_layer"s features + image_features = image_forward_outs + else: + image_features = image_forward_outs.hidden_states[self.select_layer] + + if self.select_feature == "patch": + # if the output has cls_token + image_features = image_features[:, 1:] + elif self.select_feature == "cls_patch": + image_features = image_features + elif self.select_feature == "same": + image_features = image_features + + else: + raise ValueError(f"Unexpected select feature: {self.select_feature}") + return image_features + + def forward(self, images): + """ + + Args: + images (torch.Tensor): [b, 3, H, W] + + Returns: + image_features (torch.Tensor): [b, n_patch, d] + """ + + if self.image_norm is not None: + images = self.image_norm(images) + + image_forward_outs = self.vision_tower(images, **self.forward_kwargs) + image_features = self.feature_select(image_forward_outs) + return image_features + + +class HybridVisionTower(nn.Module): + def __init__( + self, + high_res_cfg: Dict, + low_res_cfg: Dict, + freeze_high: bool = False, + freeze_low: bool = False, + concat_type: Literal["feature", "sequence", "add", "tuple"] = "tuple", + **ignore_kwargs, + ): + super().__init__() + + self.vision_tower_high = CLIPVisionTower(**high_res_cfg) + self.vision_tower_low = CLIPVisionTower(**low_res_cfg) + self.low_res_size = low_res_cfg["image_size"] + self.concat_type = concat_type + + self.high_layer_norm = nn.LayerNorm(high_res_cfg.get("output_dim", 1024)) + self.low_layer_norm = nn.LayerNorm(low_res_cfg.get("output_dim", 1024)) + + if freeze_high: + for p_name, p in self.vision_tower_high.named_parameters(): + p.requires_grad = False + self.vision_tower_high = self.vision_tower_high.eval() + else: + # train donwsamples and neck + for p_name, p in self.vision_tower_high.named_parameters(): + if "downsamples" in p_name or "neck" in p_name: + p.requires_grad = True + else: + p.requires_grad = False + + if freeze_low: + for p in self.vision_tower_low.parameters(): + p.requires_grad = False + self.vision_tower_low = self.vision_tower_low.eval() + + self.resize = torchvision.transforms.Resize(self.low_res_size, antialias=True) + + def forward(self, images: torch.Tensor): + """ + + Args: + images (torch.Tensor): [bs, 3, H, W] + 
+ Returns: + res (torch.Tensor): [bs, t, c] + """ + + # [bs, c, h, w] + high_images = images + + # [bs, c, h_low, w_low] + low_images = self.resize(images) + + # separately run two vision towers + # run high_res vision tower + high_res = self.vision_tower_high(high_images) + # [bs, c, h, w] -> [bs, h*w, c] + high_res = rearrange(high_res, "b c h w -> b (h w) c") + # run low_res vision tower + low_res = self.vision_tower_low(low_images) + + if self.concat_type == "feature": + images_features = torch.cat([high_res, low_res], dim=-1) + elif self.concat_type == "sequence": + images_features = torch.cat([high_res, low_res], dim=1) + elif self.concat_type == "add": + images_features = high_res + low_res + elif self.concat_type == "tuple": + images_features = (high_res, low_res) + + else: + raise ValueError( + "Currently only support `feature`, `sequence`, `add` and `tuple` concat type." + ) + + return images_features + + +def model_name_to_cls(cls_name): + if "MlpProjector" in cls_name: + cls = MlpProjector + + elif "CLIPVisionTower" in cls_name: + cls = CLIPVisionTower + + elif "HybridVisionTower" in cls_name: + cls = HybridVisionTower + + else: + raise ValueError(f"class_name {cls_name} is invalid.") + + return cls + + +class MultiModalityPreTrainedModel(PreTrainedModel): + config_class = DeepSeekMultiModalityConfig + base_model_prefix = "multi_modality" + _no_split_modules = [] + _skip_keys_device_placement = "past_key_values" + + +@MULTIMODAL_REGISTRY.register_image_feature_input() +@MULTIMODAL_REGISTRY.register_image_pixel_input() +@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) +class DeepSeekMultiModalityCausalLM(VisionLanguageModelBase): + + def __init__( + self, + config: DeepSeekMultiModalityConfig, + vision_language_config: VisionLanguageConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__( + config, + ) + self.config = config + vision_config = config.vision_config + self.image_size = vision_config.params.high_res_cfg.image_size + vision_cls = model_name_to_cls(vision_config.cls) + self.vision_model = vision_cls(**vision_config.params) + self.vision_tower = self.vision_model + aligner_config = config.aligner_config + aligner_cls = model_name_to_cls(aligner_config.cls) + self.aligner = aligner_cls(aligner_config.params) + + language_config = config.language_config + self.language_model = LlamaModel(language_config) + self.image_processor = VLMImageProcessor(self.image_size) + self.logits_processor = LogitsProcessor(language_config.vocab_size) + self.sampler = Sampler() + self.lm_head = ParallelLMHead( + language_config.vocab_size, language_config.hidden_size + ) + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def prepare_inputs_embeds( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + images_seq_mask: torch.LongTensor, + **kwargs, + ): + """ + + Args: + input_ids (torch.LongTensor): [b, T] + pixel_values (torch.FloatTensor): [b, n_images, 3, h, w] + images_seq_mask (torch.BoolTensor): [b, T] + + assert torch.sum(images_seq_mask) == torch.sum(images_emb_mask) + + Returns: + input_embeds (torch.Tensor): [b, T, D] + """ + + bs, n = pixel_values.shape[0:2] + images = rearrange(pixel_values, "b n c h w -> (b n) c h w") + # [b x n, T2, D] + images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( + 
self.vision_model.high_layer_norm.weight.device + ) + images_embeds = self.aligner(self.vision_model(images)) + + # [b x n, T2, D] -> [b, n x T2, D] + images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n) + + # [b, T, D] + input_ids[input_ids < 0] = 0 # ignore the image embeddings + inputs_embeds = self.language_model.get_input_embeddings(input_ids=input_ids) + + # replace with the image embeddings + images_embeds = images_embeds.reshape( + -1, self.config.aligner_config.params.n_embed + ) + inputs_embeds[images_seq_mask] = images_embeds + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs: object, + ): + pixel_values = kwargs.pop("pixel_values", None) + image_features = kwargs.pop("image_features", None) + if image_features and not pixel_values: + pixel_values = image_features + if pixel_values is not None: + image_token_id = 100015 + image_token_mask = input_ids == image_token_id + inputs_embeds = self.prepare_inputs_embeds( + input_ids, + pixel_values.reshape(1, -1, 3, self.image_size, self.image_size), + image_token_mask, + ) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model( + input_ids, positions, kv_caches, attn_metadata, inputs_embeds=inputs_embeds + ) + + return hidden_states + + def compute_logits( + self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata + ) -> torch.Tensor: + logits = self.logits_processor( + self.lm_head.weight, hidden_states, sampling_metadata + ) + return logits + + def load_weights(self, weights): + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "lm" in name: + self.lm_head.weight_loader(self.lm_head.weight, loaded_weight) + continue + if name.startswith("language_model"): + name = name.replace("language_model.model.", "language_model.", 1) + if "rotary_emb.inv_freq" in name: + continue + if "language_model" not in name: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + # if name not in params_dict: + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if ( + "mlp.experts." in name or "mlp.shared_experts." in name + ) and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # Skip experts that are not assigned to this worker. + if ( + "mlp.experts." in name or "mlp.shared_experts." 
in name + ) and name not in params_dict: + continue + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + +AutoImageProcessor.register(VLMImageProcessorConfig, VLMImageProcessor) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 60fc756a12e3d..1fd4e174177c8 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,7 +7,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig) + MPTConfig, RWConfig, + DeepSeekMultiModalityConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -24,6 +25,7 @@ "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, + "multi_modality": DeepSeekMultiModalityConfig, } for name, cls in _CONFIG_REGISTRY.items(): diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index d8170858c2a9a..f79de04a5ad06 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -7,6 +7,7 @@ from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.deepseek_vl import DeepSeekMultiModalityConfig __all__ = [ "ChatGLMConfig", @@ -15,4 +16,5 @@ "RWConfig", "JAISConfig", "MLPSpeculatorConfig", + "DeepSeekMultiModalityConfig", ] diff --git a/vllm/transformers_utils/configs/deepseek_vl.py b/vllm/transformers_utils/configs/deepseek_vl.py new file mode 100644 index 0000000000000..cfdf229531f38 --- /dev/null +++ b/vllm/transformers_utils/configs/deepseek_vl.py @@ -0,0 +1,89 @@ +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
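+#
+# HuggingFace-style configuration classes for DeepSeek-VL: VisionConfig and
+# AlignerConfig hold the vision-tower / projector parameters as attribute
+# dictionaries, and DeepSeekMultiModalityConfig ties them to a LlamaConfig
+# for the language model and registers itself with AutoConfig under the
+# "multi_modality" model type.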
+ +import sys + +from transformers import AutoConfig +from transformers import PretrainedConfig +from transformers import LlamaConfig + +if sys.version_info >= (3, 10): + print("Python version is above 3.10, patching the collections module.") + # Monkey patch collections + import collections + import collections.abc + + for type_name in collections.abc.__all__: + setattr(collections, type_name, getattr(collections.abc, type_name)) + from attrdict import AttrDict + + +class VisionConfig(PretrainedConfig): + model_type = "vision" + cls: str = "" + params: AttrDict = {} + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.cls = kwargs.get("cls", "") + if not isinstance(self.cls, str): + self.cls = self.cls.__name__ + + self.params = AttrDict(kwargs.get("params", {})) + + +class AlignerConfig(PretrainedConfig): + model_type = "aligner" + cls: str = "" + params: AttrDict = {} + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.cls = kwargs.get("cls", "") + if not isinstance(self.cls, str): + self.cls = self.cls.__name__ + + self.params = AttrDict(kwargs.get("params", {})) + + +class DeepSeekMultiModalityConfig(PretrainedConfig): + model_type = "multi_modality" + vision_config: VisionConfig + aligner_config: AlignerConfig + language_config: LlamaConfig + + def __init__(self, **kwargs): + super().__init__(**kwargs) + vision_config = kwargs.get("vision_config", {}) + self.vision_config = VisionConfig(**vision_config) + + aligner_config = kwargs.get("aligner_config", {}) + self.aligner_config = AlignerConfig(**aligner_config) + + language_config = kwargs.get("language_config", {}) + if isinstance(language_config, LlamaConfig): + self.language_config = language_config + else: + self.language_config = LlamaConfig(**language_config) + self.text_config = self.language_config + + +AutoConfig.register("multi_modality", DeepSeekMultiModalityConfig) From de63a4cd553c0b8baa35a53fe2cb33bebcef6e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 26 Jun 2024 12:00:02 +0800 Subject: [PATCH 02/47] fix requirement for deepseek-vl --- requirements-common.txt | 1 + vllm/model_executor/models/deepseek_vl.py | 474 +++++++++--------- .../transformers_utils/configs/deepseek_vl.py | 20 +- 3 files changed, 240 insertions(+), 255 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 05969cfa5d65f..c0f3b14dc0896 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -21,3 +21,4 @@ lm-format-enforcer == 0.10.1 outlines >= 0.0.43 # Requires torch >= 2.1.0 typing_extensions filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +timm >= 0.9.16 # Required for deepseek-vl model \ No newline at end of file diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 88178d9d773a2..80ba6e7a5b911 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -21,8 +21,6 @@ import warnings import copy -from dataclasses import dataclass -from functools import partial from dataclasses import dataclass from functools import partial from typing import ( @@ -46,19 +44,14 @@ import numpy as np import torchvision import torchvision.transforms.functional -import torch.nn.functional as F - - -from einops import rearrange from transformers import PreTrainedModel -from transformers.configuration_utils import PretrainedConfig -from einops import rearrange from PIL import Image from transformers import 
AutoImageProcessor, PretrainedConfig -from transformers.image_processing_utils import BaseImageProcessor, BatchFeature +from transformers.image_processing_utils import ( + BaseImageProcessor, + BatchFeature, +) from transformers.image_utils import to_numpy_array -from einops import rearrange -from transformers import PreTrainedModel from timm.layers import ( AttentionPoolLatent, DropPath, @@ -68,13 +61,13 @@ PatchEmbed, resample_abs_pos_embed, ) -from timm.models._manipulate import checkpoint_seq, named_apply - +from timm.models._manipulate import checkpoint_seq from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, ) from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -86,7 +79,6 @@ from .vlm_base import VisionLanguageModelBase from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig - ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -94,6 +86,18 @@ IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) +class AttrDict: + + def __init__(self, entries): + for key, value in entries.items(): + if isinstance(value, dict): + entries[key] = AttrDict(value) + self.__dict__.update(entries) + + def get(self, key, default_val=None): + return self.__dict__.get(key, default_val) + + def expand2square(pil_img, background_color): width, height = pil_img.size if width == height: @@ -205,7 +209,8 @@ def resize(self, pil_img: Image) -> np.ndarray: pil_img = torchvision.transforms.functional.resize( pil_img, size, - interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC, + interpolation=torchvision.transforms.functional.InterpolationMode. 
+ BICUBIC, antialias=True, ) @@ -217,7 +222,10 @@ def resize(self, pil_img: Image) -> np.ndarray: return x - def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature: + def preprocess(self, + images, + return_tensors: str = "pt", + **kwargs) -> BatchFeature: # resize and pad to [self.image_size, self.image_size] # then convert from [H, W, 3] to [3, H, W] # print(images) @@ -233,8 +241,7 @@ def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeatu image=image, scale=self.rescale_factor, input_data_format="channels_first", - ) - for image in images + ) for image in images ] # normalize @@ -245,8 +252,7 @@ def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeatu mean=self.image_mean, std=self.image_std, input_data_format="channels_first", - ) - for image in images + ) for image in images ] data = {"pixel_values": images} @@ -258,9 +264,10 @@ def default_shape(self): class MlpProjector(nn.Module): + def __init__(self, cfg): super().__init__() - + cfg = AttrDict(cfg) self.cfg = cfg if cfg.projector_type == "identity": @@ -294,14 +301,16 @@ def __init__(self, cfg): self.layers = modules def forward( - self, x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor] + self, + x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], ): """ Args: - x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, - then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); - otherwise it is the feature from the single vision encoder. + x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: + if it is a tuple of torch.Tensor, + then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); + otherwise it is the feature from the single vision encoder. Returns: x (torch.Tensor): [b, s, c] @@ -360,7 +369,7 @@ def norm_cdf(x): def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): # type: (torch.Tensor, float, float, float, float) -> torch.Tensor r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first - convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its orignal dtype. + convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its original dtype. Fills the input Tensor with values drawn from a truncated normal distribution. 
The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within @@ -387,20 +396,10 @@ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): def init_weights(self): if self.pos_embed is not None: - trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5) + trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1]**-0.5) trunc_normal_(self.latent, std=self.latent_dim**-0.5) -def init_weights_vit_timm(module: nn.Module, name: str = "") -> None: - """ViT weight initialization, original timm impl (for reproducibility)""" - if isinstance(module, nn.Linear): - trunc_normal_(module.weight, std=0.02) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif hasattr(module, "init_weights"): - module.init_weights() - - class SigLipAttention(nn.Module): fused_attn: Final[bool] @@ -427,15 +426,13 @@ def __init__( self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0.0 else nn.Identity() + self.proj_drop = (nn.Dropout(proj_drop) + if proj_drop > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape - qkv = ( - self.qkv(x) - .reshape(B, N, 3, self.num_heads, self.head_dim) - .permute(2, 0, 3, 1, 4) - ) + qkv = (self.qkv(x).reshape(B, N, 3, self.num_heads, + self.head_dim).permute(2, 0, 3, 1, 4)) q, k, v = qkv.unbind(0) q, k = self.q_norm(q), self.k_norm(k) @@ -460,6 +457,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class LayerScale(nn.Module): + def __init__( self, dim: int, @@ -475,6 +473,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class SigLipBlock(nn.Module): + def __init__( self, dim: int, @@ -501,10 +500,10 @@ def __init__( proj_drop=proj_drop, norm_layer=norm_layer, ) - self.ls1 = ( - LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - ) - self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.ls1 = (LayerScale(dim, init_values=init_values) + if init_values else nn.Identity()) + self.drop_path1 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) self.norm2 = norm_layer(dim) self.mlp = mlp_layer( @@ -513,10 +512,10 @@ def __init__( act_layer=act_layer, drop=proj_drop, ) - self.ls2 = ( - LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - ) - self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.ls2 = (LayerScale(dim, init_values=init_values) + if init_values else nn.Identity()) + self.drop_path2 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -560,7 +559,6 @@ def __init__( proj_drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.0, - weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "", embed_layer: Callable = PatchEmbed, norm_layer: Optional[LayerType] = None, act_layer: Optional[LayerType] = None, @@ -573,7 +571,7 @@ def __init__( img_size: Input image size. patch_size: Patch size. in_chans: Number of image input channels. - num_classes: Mumber of classes for classification head. + num_classes: Number of classes for classification head. global_pool: Type of global pooling for final sequence (default: 'token'). embed_dim: Transformer embedding dimension. 
depth: Depth of transformer. @@ -635,16 +633,14 @@ def __init__( ) num_patches = self.patch_embed.num_patches - self.cls_token = ( - nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None - ) - self.reg_token = ( - nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None - ) - embed_len = ( - num_patches if no_embed_class else num_patches + self.num_prefix_tokens - ) - self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02) + self.cls_token = (nn.Parameter(torch.zeros(1, 1, embed_dim)) + if class_token else None) + self.reg_token = (nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) + if reg_tokens else None) + embed_len = (num_patches if no_embed_class else num_patches + + self.num_prefix_tokens) + self.pos_embed = nn.Parameter( + torch.randn(1, embed_len, embed_dim) * 0.02) self.pos_drop = nn.Dropout(p=pos_drop_rate) if patch_drop_rate > 0: self.patch_drop = PatchDropout( @@ -655,28 +651,24 @@ def __init__( self.patch_drop = nn.Identity() self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity() - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule - self.blocks = nn.Sequential( - *[ - block_fn( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_norm=qk_norm, - init_values=init_values, - proj_drop=proj_drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - act_layer=act_layer, - mlp_layer=mlp_layer, - ) - for i in range(depth) - ] - ) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential(*[ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + init_values=init_values, + proj_drop=proj_drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + mlp_layer=mlp_layer, + ) for i in range(depth) + ]) self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() # Classifier Head @@ -692,20 +684,8 @@ def __init__( self.attn_pool = None self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() self.head_drop = nn.Dropout(drop_rate) - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - - if weight_init != "skip": - self.init_weights(weight_init) - - def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None: - assert mode in ("jax", "jax_nlhb", "moco", "") - # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0 - trunc_normal_(self.pos_embed, std=0.02) - if self.cls_token is not None: - nn.init.normal_(self.cls_token, std=1e-6) - named_apply(init_weights_vit_timm, self) + self.head = (nn.Linear(self.embed_dim, num_classes) + if num_classes > 0 else nn.Identity()) @torch.jit.ignore def no_weight_decay(self) -> Set: @@ -715,7 +695,7 @@ def no_weight_decay(self) -> Set: def group_matcher(self, coarse: bool = False) -> Dict: return dict( stem=r"^cls_token|pos_embed|patch_embed", # stem and embed - blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))], + blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999, ))], ) @torch.jit.ignore @@ -731,15 +711,14 @@ def reset_classifier(self, num_classes: int, global_pool=None) -> None: if global_pool is not None: assert global_pool in ("", "avg", "token", "map") if global_pool == "map" and self.attn_pool is None: - assert ( - False - ), "Cannot currently add attention pooling in 
reset_classifier()." + raise AssertionError( + "Cannot currently add attention pooling in reset_classifier()." + ) elif global_pool != "map " and self.attn_pool is not None: self.attn_pool = None # remove attention pooling self.global_pool = global_pool - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) + self.head = (nn.Linear(self.embed_dim, num_classes) + if num_classes > 0 else nn.Identity()) def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: if self.dynamic_img_size: @@ -747,7 +726,8 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: pos_embed = resample_abs_pos_embed( self.pos_embed, (H, W), - num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens, + num_prefix_tokens=(0 if self.no_embed_class else + self.num_prefix_tokens), ) x = x.view(B, -1, C) else: @@ -781,8 +761,7 @@ def _intermediate_layers( ) -> List[torch.Tensor]: outputs, num_blocks = [], len(self.blocks) take_indices = set( - range(num_blocks - n, num_blocks) if isinstance(n, int) else n - ) + range(num_blocks - n, num_blocks) if isinstance(n, int) else n) # forward pass x = self.patch_embed(x) @@ -811,15 +790,14 @@ def get_intermediate_layers( outputs = self._intermediate_layers(x, n) if norm: outputs = [self.norm(out) for out in outputs] - prefix_tokens = [out[:, 0 : self.num_prefix_tokens] for out in outputs] - outputs = [out[:, self.num_prefix_tokens :] for out in outputs] + prefix_tokens = [out[:, 0:self.num_prefix_tokens] for out in outputs] + outputs = [out[:, self.num_prefix_tokens:] for out in outputs] if reshape: grid_size = self.patch_embed.grid_size outputs = [ - out.reshape(x.shape[0], grid_size[0], grid_size[1], -1) - .permute(0, 3, 1, 2) - .contiguous() + out.reshape(x.shape[0], grid_size[0], grid_size[1], + -1).permute(0, 3, 1, 2).contiguous() for out in outputs ] @@ -839,11 +817,13 @@ def forward_features(self, x: torch.Tensor) -> torch.Tensor: x = self.norm(x) return x - def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor: + def forward_head(self, + x: torch.Tensor, + pre_logits: bool = False) -> torch.Tensor: if self.attn_pool is not None: x = self.attn_pool(x) elif self.global_pool == "avg": - x = x[:, self.num_prefix_tokens :].mean(dim=1) + x = x[:, self.num_prefix_tokens:].mean(dim=1) elif self.global_pool: x = x[:, 0] # class token x = self.fc_norm(x) @@ -912,9 +892,8 @@ def create_siglip_vit( ckpt_path: str = "", **kwargs, ): - assert ( - model_name in SigLIP_MODEL_CONFIG.keys() - ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}" + assert (model_name in SigLIP_MODEL_CONFIG + ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}" vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name]) @@ -933,7 +912,6 @@ def create_siglip_vit( class_token=vision_cfg.class_token, global_pool=vision_cfg.global_pool, ignore_head=kwargs.get("ignore_head", True), - weight_init=kwargs.get("weight_init", "skip"), num_classes=0, ) @@ -941,15 +919,14 @@ def create_siglip_vit( state_dict = torch.load(ckpt_path, map_location="cpu") incompatible_keys = model.load_state_dict(state_dict, strict=False) - print( - f"SigLIP-ViT restores from {ckpt_path},\n" - f"\tincompatible_keys:', {incompatible_keys}." 
- ) + print(f"SigLIP-ViT restores from {ckpt_path},\n" + f"\tincompatible_keys:', {incompatible_keys}.") return model class MLPBlock(nn.Module): + def __init__( self, embedding_dim: int, @@ -968,6 +945,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: super().__init__() self.weight = nn.Parameter(torch.ones(num_channels)) @@ -984,25 +962,26 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa class ImageEncoderViT(nn.Module): + def __init__( - self, - img_size: int = 1024, - patch_size: int = 16, - in_chans: int = 3, - embed_dim: int = 768, - depth: int = 12, - num_heads: int = 12, - mlp_ratio: float = 4.0, - out_chans: int = 256, - qkv_bias: bool = True, - norm_layer: Type[nn.Module] = nn.LayerNorm, - act_layer: Type[nn.Module] = nn.GELU, - use_abs_pos: bool = True, - use_rel_pos: bool = False, - rel_pos_zero_init: bool = True, - window_size: int = 0, - global_attn_indexes: Tuple[int, ...] = (), - downsample_channels: Tuple[int, ...] = (512, 1024), + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + downsample_channels: Tuple[int, ...] = (512, 1024), ) -> None: """ Args: @@ -1038,9 +1017,11 @@ def __init__( # Initialize absolute positional embedding with pretrain image size. 
self.pos_embed = nn.Parameter( torch.zeros( - 1, img_size // patch_size, img_size // patch_size, embed_dim - ) - ) + 1, + img_size // patch_size, + img_size // patch_size, + embed_dim, + )) self.blocks = nn.ModuleList() for i in range(depth): @@ -1088,8 +1069,7 @@ def __init__( stride=2, padding=1, bias=False, - ) - ) + )) in_channels = out_channels self.downsamples = nn.Sequential(*downsamples) @@ -1113,13 +1093,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.neck(x.permute(0, 3, 1, 2)) x_dtype = x.dtype - x = F.interpolate( - x.float(), size=(96, 96), mode="bilinear", align_corners=False - ).to(x_dtype) + x = F.interpolate(x.float(), + size=(96, 96), + mode="bilinear", + align_corners=False).to(x_dtype) x = self.downsamples(x) if self.sam_hd: - first_global_feature = self.neck_hd(global_features[0].permute(0, 3, 1, 2)) + first_global_feature = self.neck_hd(global_features[0].permute( + 0, 3, 1, 2)) x_dtype = first_global_feature.dtype first_global_feature = F.interpolate( first_global_feature.float(), @@ -1127,7 +1109,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: mode="bilinear", align_corners=False, ) - first_global_feature = self.downsamples(first_global_feature.to(x_dtype)) + first_global_feature = self.downsamples( + first_global_feature.to(x_dtype)) x = x + first_global_feature * self.hd_alpha_downsamples return x @@ -1172,13 +1155,14 @@ def __init__( qkv_bias=qkv_bias, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, - input_size=input_size if window_size == 0 else (window_size, window_size), + input_size=(input_size if window_size == 0 else + (window_size, window_size)), ) self.norm2 = norm_layer(dim) - self.mlp = MLPBlock( - embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer - ) + self.mlp = MLPBlock(embedding_dim=dim, + mlp_dim=int(dim * mlp_ratio), + act=act_layer) self.window_size = window_size @@ -1237,32 +1221,29 @@ def __init__( input_size is not None ), "Input size must be provided if using relative positional encoding." 
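# Note on the relative-position scheme used below (see get_rel_pos and
# add_decomposed_rel_pos): rather than a joint 2-D table, two 1-D embedding
# tables are kept, one per spatial axis, each with 2*dim - 1 rows so that
# every signed query/key offset along that axis has an entry; the per-axis
# terms are then added to the attention logits separately.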
# initialize relative positional embeddings - self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) - self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + self.rel_pos_h = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter( + torch.zeros(2 * input_size[1] - 1, head_dim)) def forward(self, x: torch.Tensor) -> torch.Tensor: B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H * W, C) - qkv = ( - self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - ) + qkv = (self.qkv(x).reshape(B, H * W, 3, self.num_heads, + -1).permute(2, 0, 3, 1, 4)) # q, k, v with shape (B * nHead, H * W, C) q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) def do_attention(q, k, v): attn = (q * self.scale) @ k.transpose(-2, -1) if self.use_rel_pos: - attn = add_decomposed_rel_pos( - attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W) - ) + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, + self.rel_pos_w, (H, W), (H, W)) attn = attn.softmax(dim=-1) - x = ( - (attn @ v) - .view(B, self.num_heads, H, W, -1) - .permute(0, 2, 3, 1, 4) - .reshape(B, H, W, -1) - ) + x = ((attn @ v).view(B, self.num_heads, H, W, + -1).permute(0, 2, 3, 1, + 4).reshape(B, H, W, -1)) return x @@ -1274,9 +1255,8 @@ def do_attention(q, k, v): return x -def window_partition( - x: torch.Tensor, window_size: int -) -> Tuple[torch.Tensor, Tuple[int, int]]: +def window_partition(x: torch.Tensor, + window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]: """ Partition into non-overlapping windows with padding if needed. Args: @@ -1295,10 +1275,10 @@ def window_partition( x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) Hp, Wp = H + pad_h, W + pad_w - x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) - windows = ( - x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - ) + x = x.view(B, Hp // window_size, window_size, Wp // window_size, + window_size, C) + windows = (x.permute(0, 1, 3, 2, 4, + 5).contiguous().view(-1, window_size, window_size, C)) return windows, (Hp, Wp) @@ -1322,9 +1302,8 @@ def window_unpartition( Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) - x = windows.view( - B, Hp // window_size, Wp // window_size, window_size, window_size, -1 - ) + x = windows.view(B, Hp // window_size, Wp // window_size, window_size, + window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) if Hp > H or Wp > W: @@ -1332,7 +1311,8 @@ def window_unpartition( return x -def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: +def get_rel_pos(q_size: int, k_size: int, + rel_pos: torch.Tensor) -> torch.Tensor: """ Get relative positional embeddings according to the relative positions of query and key sizes. @@ -1353,14 +1333,16 @@ def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor size=max_rel_dist, mode="linear", ) - rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + rel_pos_resized = rel_pos_resized.reshape(-1, + max_rel_dist).permute(1, 0) else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. 
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) - relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + relative_coords = (q_coords - + k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) return rel_pos_resized[relative_coords.long()] @@ -1397,11 +1379,8 @@ def add_decomposed_rel_pos( rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) - attn = ( - attn.view(B, q_h, q_w, k_h, k_w) - + rel_h[:, :, :, :, None] - + rel_w[:, :, :, None, :] - ).view(B, q_h * q_w, k_h * k_w) + attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w) return attn @@ -1430,7 +1409,11 @@ def __init__( super().__init__() self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + in_chans, + embed_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -1492,7 +1475,7 @@ def create_sam_vit( **kwargs, ): assert ( - model_name in SAM_MODEL_CONFIG.keys() + model_name in SAM_MODEL_CONFIG ), f"model name: {model_name} should be in {SAM_MODEL_CONFIG.keys()}" sam_cfg = SAMViTCfg(**SAM_MODEL_CONFIG[model_name]) @@ -1521,6 +1504,7 @@ def create_sam_vit( class CLIPVisionTower(nn.Module): + def __init__( self, model_name: str = "siglip_large_patch16_384", @@ -1548,13 +1532,11 @@ def __init__( } vision_tower_params.update(kwargs) self.vision_tower, self.forward_kwargs = self.build_vision_tower( - vision_tower_params - ) + vision_tower_params) if pixel_mean is not None and pixel_std is not None: - image_norm = torchvision.transforms.Normalize( - mean=pixel_mean, std=pixel_std - ) + image_norm = torchvision.transforms.Normalize(mean=pixel_mean, + std=pixel_std) else: image_norm = None @@ -1573,7 +1555,8 @@ def build_vision_tower(self, vision_tower_params): else: # huggingface from transformers import CLIPVisionModel - vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params) + vision_tower = CLIPVisionModel.from_pretrained( + **vision_tower_params) forward_kwargs = dict(output_hidden_states=True) return vision_tower, forward_kwargs @@ -1583,18 +1566,18 @@ def feature_select(self, image_forward_outs): # the output has been the self.select_layer"s features image_features = image_forward_outs else: - image_features = image_forward_outs.hidden_states[self.select_layer] + image_features = image_forward_outs.hidden_states[ + self.select_layer] if self.select_feature == "patch": # if the output has cls_token image_features = image_features[:, 1:] - elif self.select_feature == "cls_patch": - image_features = image_features - elif self.select_feature == "same": + elif (self.select_feature == "cls_patch" + or self.select_feature == "same"): image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {self.select_feature}") + raise ValueError( + f"Unexpected select feature: {self.select_feature}") return image_features def forward(self, images): @@ -1616,6 +1599,7 @@ def forward(self, images): class HybridVisionTower(nn.Module): + def __init__( self, high_res_cfg: Dict, @@ -1632,7 +1616,8 @@ def __init__( self.low_res_size = low_res_cfg["image_size"] self.concat_type = concat_type - self.high_layer_norm = nn.LayerNorm(high_res_cfg.get("output_dim", 1024)) + self.high_layer_norm = nn.LayerNorm( + high_res_cfg.get("output_dim", 1024)) self.low_layer_norm = 
nn.LayerNorm(low_res_cfg.get("output_dim", 1024)) if freeze_high: @@ -1652,7 +1637,8 @@ def __init__( p.requires_grad = False self.vision_tower_low = self.vision_tower_low.eval() - self.resize = torchvision.transforms.Resize(self.low_res_size, antialias=True) + self.resize = torchvision.transforms.Resize(self.low_res_size, + antialias=True) def forward(self, images: torch.Tensor): """ @@ -1674,7 +1660,9 @@ def forward(self, images: torch.Tensor): # run high_res vision tower high_res = self.vision_tower_high(high_images) # [bs, c, h, w] -> [bs, h*w, c] - high_res = rearrange(high_res, "b c h w -> b (h w) c") + b, c, h, w = high_res.shape + high_res = torch.einsum("bchw->bhwc", high_res) + high_res = high_res.reshape(b, h * w, c) # run low_res vision tower low_res = self.vision_tower_low(low_images) @@ -1730,16 +1718,19 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): - super().__init__( - config, - ) + super().__init__(config, ) self.config = config vision_config = config.vision_config - self.image_size = vision_config.params.high_res_cfg.image_size + aligner_config = config.aligner_config + self.image_size = aligner_config.params["input_dim"] + self.image_size = vision_config.params.get("image_size") + if not self.image_size: + # Get image size for 7b model + self.image_size = vision_config.params["high_res_cfg"][ + "image_size"] vision_cls = model_name_to_cls(vision_config.cls) self.vision_model = vision_cls(**vision_config.params) self.vision_tower = self.vision_model - aligner_config = config.aligner_config aligner_cls = model_name_to_cls(aligner_config.cls) self.aligner = aligner_cls(aligner_config.params) @@ -1748,9 +1739,8 @@ def __init__( self.image_processor = VLMImageProcessor(self.image_size) self.logits_processor = LogitsProcessor(language_config.vocab_size) self.sampler = Sampler() - self.lm_head = ParallelLMHead( - language_config.vocab_size, language_config.hidden_size - ) + self.lm_head = ParallelLMHead(language_config.vocab_size, + language_config.hidden_size) def sample( self, @@ -1781,24 +1771,26 @@ def prepare_inputs_embeds( """ bs, n = pixel_values.shape[0:2] - images = rearrange(pixel_values, "b n c h w -> (b n) c h w") + p_b, p_n, p_c, p_h, p_w = pixel_values.shape + images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) # [b x n, T2, D] - images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( - self.vision_model.high_layer_norm.weight.device - ) + # images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( + # self.vision_model.high_layer_norm.weight.device + # ) images_embeds = self.aligner(self.vision_model(images)) # [b x n, T2, D] -> [b, n x T2, D] - images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n) + _, t, d = images_embeds.shape + images_embeds = images_embeds.reshape(bs, n * t, d) # [b, T, D] input_ids[input_ids < 0] = 0 # ignore the image embeddings - inputs_embeds = self.language_model.get_input_embeddings(input_ids=input_ids) + inputs_embeds = self.language_model.get_input_embeddings( + input_ids=input_ids) # replace with the image embeddings images_embeds = images_embeds.reshape( - -1, self.config.aligner_config.params.n_embed - ) + -1, self.config.aligner_config.params["n_embed"]) inputs_embeds[images_seq_mask] = images_embeds return inputs_embeds @@ -1820,7 +1812,8 @@ def forward( image_token_mask = input_ids == image_token_id inputs_embeds = self.prepare_inputs_embeds( input_ids, - pixel_values.reshape(1, -1, 3, self.image_size, 
self.image_size), + pixel_values.reshape(1, -1, 3, self.image_size, + self.image_size), image_token_mask, ) @@ -1829,17 +1822,19 @@ def forward( inputs_embeds = None hidden_states = self.language_model( - input_ids, positions, kv_caches, attn_metadata, inputs_embeds=inputs_embeds + input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds, ) return hidden_states - def compute_logits( - self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata - ) -> torch.Tensor: - logits = self.logits_processor( - self.lm_head.weight, hidden_states, sampling_metadata - ) + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) return logits def load_weights(self, weights): @@ -1857,12 +1852,14 @@ def load_weights(self, weights): self.lm_head.weight_loader(self.lm_head.weight, loaded_weight) continue if name.startswith("language_model"): - name = name.replace("language_model.model.", "language_model.", 1) + name = name.replace("language_model.model.", "language_model.", + 1) if "rotary_emb.inv_freq" in name: continue if "language_model" not in name: param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) continue for param_name, weight_name, shard_id in stacked_params_mapping: @@ -1874,9 +1871,8 @@ def load_weights(self, weights): if name.endswith(".bias") and name not in params_dict: continue # Skip experts that are not assigned to this worker. - if ( - "mlp.experts." in name or "mlp.shared_experts." in name - ) and name not in params_dict: + if ("mlp.experts." in name or "mlp.shared_experts." + in name) and name not in params_dict: continue param = params_dict[name] weight_loader = param.weight_loader @@ -1887,14 +1883,14 @@ def load_weights(self, weights): # if name.endswith(".bias") and name not in params_dict: # continue # Skip experts that are not assigned to this worker. - if ( - "mlp.experts." in name or "mlp.shared_experts." in name - ) and name not in params_dict: + if ("mlp.experts." in name or "mlp.shared_experts." + in name) and name not in params_dict: continue if name not in params_dict: continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/configs/deepseek_vl.py b/vllm/transformers_utils/configs/deepseek_vl.py index cfdf229531f38..b14d8cbdf5b3c 100644 --- a/vllm/transformers_utils/configs/deepseek_vl.py +++ b/vllm/transformers_utils/configs/deepseek_vl.py @@ -17,27 +17,15 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
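The `attrdict` import removed below is no longer needed: attribute-style access to the nested `params` dicts is now handled by the small `AttrDict` helper added to `vllm/model_executor/models/deepseek_vl.py` earlier in this patch, and the config classes keep plain dicts. A minimal sketch of that helper's behavior, with illustrative keys and values:

    from vllm.model_executor.models.deepseek_vl import AttrDict

    cfg = AttrDict({"projector_type": "mlp_gelu", "params": {"n_embed": 2048}})
    cfg.projector_type   # "mlp_gelu"  (attribute-style access)
    cfg.params.n_embed   # 2048        (nested dicts are wrapped recursively)
    cfg.get("depth", 2)  # 2           (dict-style fallback via .get)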
-import sys - from transformers import AutoConfig from transformers import PretrainedConfig from transformers import LlamaConfig -if sys.version_info >= (3, 10): - print("Python version is above 3.10, patching the collections module.") - # Monkey patch collections - import collections - import collections.abc - - for type_name in collections.abc.__all__: - setattr(collections, type_name, getattr(collections.abc, type_name)) - from attrdict import AttrDict - class VisionConfig(PretrainedConfig): model_type = "vision" cls: str = "" - params: AttrDict = {} + params: dict = {} def __init__(self, **kwargs): super().__init__(**kwargs) @@ -46,13 +34,13 @@ def __init__(self, **kwargs): if not isinstance(self.cls, str): self.cls = self.cls.__name__ - self.params = AttrDict(kwargs.get("params", {})) + self.params = kwargs.get("params", {}) class AlignerConfig(PretrainedConfig): model_type = "aligner" cls: str = "" - params: AttrDict = {} + params: dict = {} def __init__(self, **kwargs): super().__init__(**kwargs) @@ -61,7 +49,7 @@ def __init__(self, **kwargs): if not isinstance(self.cls, str): self.cls = self.cls.__name__ - self.params = AttrDict(kwargs.get("params", {})) + self.params = kwargs.get("params", {}) class DeepSeekMultiModalityConfig(PretrainedConfig): From 10b5cddb5e6492a274d05dfd615179d886888ea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 26 Jun 2024 17:27:28 +0800 Subject: [PATCH 03/47] Removed unused code, added documentation, added examples --- docs/source/models/supported_models.rst | 4 + examples/deepseek_vl_example.py | 125 ++++++++++++++++++++++ vllm/model_executor/models/deepseek_vl.py | 13 +-- 3 files changed, 131 insertions(+), 11 deletions(-) create mode 100644 examples/deepseek_vl_example.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 47737ae525209..86e9304e50272 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -163,6 +163,10 @@ Alongside each architecture, we include some popular models that use it. - Xverse - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - + * - :code:`DeepSeekMultiModalityCausalLM` + - deepseek-ai + - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py new file mode 100644 index 0000000000000..ec610a622ad35 --- /dev/null +++ b/examples/deepseek_vl_example.py @@ -0,0 +1,125 @@ +import argparse +import os +import subprocess + +import torch +from PIL import Image + +from vllm import LLM +from vllm.multimodal.image import ImageFeatureData, ImagePixelData +from vllm.model_executor.models.deepseek_vl import VLMImageProcessor + +# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. 
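# For reference, a typical invocation once this file lands as
# examples/deepseek_vl_example.py (flag names follow the argparse setup at the
# bottom of this script; the AWS CLI is assumed to be installed for the
# anonymous `aws s3 sync` download of the sample images):
#
#     python examples/deepseek_vl_example.py --type pixel_values
#     python examples/deepseek_vl_example.py --type image_features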
+# You can use `.buildkite/download-images.sh` to download them +from vllm import SamplingParams + +sample_params = SamplingParams(temperature=0, max_tokens=1024) + +model = "deepseek-ai/deepseek-vl-7b-chat" + + +def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): + llm = LLM( + model=model, + image_input_type="pixel_values", + image_token_id=100015, + image_input_shape="1,3,1024,1024", + image_feature_size=576, + disable_image_processor=False, + gpu_memory_utilization=0.9, + max_model_len=3072, + enforce_eager=True, + ) + + prompt = f"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" + + if disable_image_processor: + image = get_image_features() + else: + image = Image.open("images/stop_sign.jpg") + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImagePixelData(image), + }, + sampling_params=sample_params, + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def run_deepseek_vl_image_features(): + llm = LLM( + model=model, + image_input_type="image_features", + image_token_id=100015, + image_input_shape="1,3,1024,1024", + image_feature_size=576, + gpu_memory_utilization=0.9, + max_model_len=3072, + enforce_eager=True, + ) + prompt = f"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" + + image: torch.Tensor = get_image_features() + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImageFeatureData(image), + }, + sampling_params=sample_params, + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def get_image_features(): + image_feature = VLMImageProcessor(1024)(Image.open("images/stop_sign.jpg"))[ + "pixel_values" + ] + torch.save(image_feature, "images/deepseek_vl_stop_sign.pt") + return torch.load("images/deepseek_vl_stop_sign.pt") + + +def main(args): + if args.type == "pixel_values": + run_deepseek_vl_pixel_values() + else: + run_deepseek_vl_image_features() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Demo on deepseek-vl") + parser.add_argument( + "--type", + type=str, + choices=["pixel_values", "image_features"], + default="pixel_values", + help="image input type", + ) + args = parser.parse_args() + # Download from s3 + s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" + local_directory = "images" + + # Make sure the local directory exists or create it + os.makedirs(local_directory, exist_ok=True) + + # Use AWS CLI to sync the directory, assume anonymous access + subprocess.check_call( + [ + "aws", + "s3", + "sync", + s3_bucket_path, + local_directory, + "--no-sign-request", + ] + ) + main(args) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 80ba6e7a5b911..ec1ff36e5ab0a 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -167,7 +167,7 @@ def __init__( 0.27577711, ), rescale_factor: float = 1.0 / 255.0, - do_normalize: bool = True, + do_normalize: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -228,7 +228,6 @@ def preprocess(self, **kwargs) -> BatchFeature: # resize and 
pad to [self.image_size, self.image_size] # then convert from [H, W, 3] to [3, H, W] - # print(images) if not isinstance(images, List): images = [ images, @@ -1773,10 +1772,6 @@ def prepare_inputs_embeds( bs, n = pixel_values.shape[0:2] p_b, p_n, p_c, p_h, p_w = pixel_values.shape images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) - # [b x n, T2, D] - # images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( - # self.vision_model.high_layer_norm.weight.device - # ) images_embeds = self.aligner(self.vision_model(images)) # [b x n, T2, D] -> [b, n x T2, D] @@ -1805,7 +1800,7 @@ def forward( ): pixel_values = kwargs.pop("pixel_values", None) image_features = kwargs.pop("image_features", None) - if image_features and not pixel_values: + if image_features is not None and pixel_values is None: pixel_values = image_features if pixel_values is not None: image_token_id = 100015 @@ -1879,10 +1874,6 @@ def load_weights(self, weights): weight_loader(param, loaded_weight, shard_id) break else: - # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # Skip experts that are not assigned to this worker. if ("mlp.experts." in name or "mlp.shared_experts." in name) and name not in params_dict: continue From 09633373747b86beffbc4df13528a9358e388ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 26 Jun 2024 18:16:48 +0800 Subject: [PATCH 04/47] Added test cases, deleted model dependencies, and optimized code --- requirements-common.txt | 3 +- tests/models/test_deepseek_vl.py | 128 ++++++++++++++++++++++ vllm/model_executor/models/deepseek_vl.py | 2 +- 3 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 tests/models/test_deepseek_vl.py diff --git a/requirements-common.txt b/requirements-common.txt index c0f3b14dc0896..a0063062f4c36 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -20,5 +20,4 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.10.1 outlines >= 0.0.43 # Requires torch >= 2.1.0 typing_extensions -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 -timm >= 0.9.16 # Required for deepseek-vl model \ No newline at end of file +filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py new file mode 100644 index 0000000000000..1b74592d5e571 --- /dev/null +++ b/tests/models/test_deepseek_vl.py @@ -0,0 +1,128 @@ +from typing import List, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.config import VisionLanguageConfig + +from ..conftest import IMAGE_FILES + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", + "You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", +] + +assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + + +def iter_llava_configs(model_name: str): + image_hw_to_feature_size = { + (1024, 1024): 576, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), + ]: + yield ( + model_name, + VisionLanguageConfig( + image_input_type=input_type, + image_feature_size=f, + image_token_id=100015, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None, + ), + ) + + +model_and_vl_config = [ + *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), +] + + +def vllm_to_hf_output( + vllm_output: Tuple[List[int], str], vlm_config: VisionLanguageConfig, model_id: str +): + """Sanitize vllm output to be comparable with hf output. + The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". + """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id + for idx, input_id in enumerate(input_ids) + if input_id != image_token_id or input_ids[idx - 1] != image_token_id + ] + hf_output_str = output_str.replace( + image_token_str * vlm_config.image_feature_size, "" + ) + + return hf_input_ids, hf_output_str + + +# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models( + hf_runner, + vllm_runner, + hf_images, + vllm_images, + model_and_config, + dtype: str, + max_tokens: int, +) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + model_id, vlm_config = model_and_config + + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs = hf_model.generate_greedy( + HF_IMAGE_PROMPTS, max_tokens, images=hf_images + ) + + vllm_image_prompts = [ + p.replace( + "", "" * vlm_config.image_feature_size + ) + for p in HF_IMAGE_PROMPTS + ] + + with vllm_runner( + model_id, dtype=dtype, enforce_eager=True, **vlm_config.as_cli_args_dict() + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + vllm_image_prompts, max_tokens, images=vllm_images + ) + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id + ) + assert ( + hf_output_str == vllm_output_str + ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" + assert ( + hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index ec1ff36e5ab0a..4f9ca839cac75 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -1734,7 +1734,7 @@ def __init__( self.aligner = aligner_cls(aligner_config.params) language_config = config.language_config - self.language_model = LlamaModel(language_config) + self.language_model = LlamaModel(language_config, cache_config, quant_config) self.image_processor = VLMImageProcessor(self.image_size) self.logits_processor = LogitsProcessor(language_config.vocab_size) self.sampler = Sampler() From 9752b0c2362c288f8a7e32d8f7e37ef89a8bc15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 27 Jun 2024 15:31:08 +0800 Subject: [PATCH 05/47] code reformat --- examples/deepseek_vl_example.py | 23 ++++++------ tests/models/test_deepseek_vl.py | 43 ++++++++++------------- vllm/model_executor/models/deepseek_vl.py | 11 +++--- 3 files changed, 35 insertions(+), 42 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index ec610a622ad35..2032fb2a9473a 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -80,9 +80,8 @@ def run_deepseek_vl_image_features(): def get_image_features(): - image_feature = VLMImageProcessor(1024)(Image.open("images/stop_sign.jpg"))[ - "pixel_values" - ] + image_feature = VLMImageProcessor(1024)( + Image.open("images/stop_sign.jpg"))["pixel_values"] torch.save(image_feature, "images/deepseek_vl_stop_sign.pt") return torch.load("images/deepseek_vl_stop_sign.pt") @@ -112,14 +111,12 @@ def main(args): os.makedirs(local_directory, exist_ok=True) # Use AWS CLI to sync the directory, assume anonymous access - subprocess.check_call( - [ - "aws", - "s3", - "sync", - s3_bucket_path, - local_directory, - "--no-sign-request", - ] - ) + subprocess.check_call([ + "aws", + "s3", + "sync", + s3_bucket_path, + local_directory, + "--no-sign-request", + ]) main(args) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 1b74592d5e571..892e89422f2d4 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -46,9 +46,8 @@ def iter_llava_configs(model_name: str): ] -def vllm_to_hf_output( - vllm_output: Tuple[List[int], str], vlm_config: VisionLanguageConfig, model_id: str -): +def vllm_to_hf_output(vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, model_id: str): """Sanitize vllm output to be comparable 
with hf output. The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... @@ -61,13 +60,11 @@ def vllm_to_hf_output( image_token_str = tokenizer.decode(image_token_id) hf_input_ids = [ - input_id - for idx, input_id in enumerate(input_ids) + input_id for idx, input_id in enumerate(input_ids) if input_id != image_token_id or input_ids[idx - 1] != image_token_id ] hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "" - ) + image_token_str * vlm_config.image_feature_size, "") return hf_input_ids, hf_output_str @@ -97,32 +94,30 @@ def test_models( model_id, vlm_config = model_and_config with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: - hf_outputs = hf_model.generate_greedy( - HF_IMAGE_PROMPTS, max_tokens, images=hf_images - ) + hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images) vllm_image_prompts = [ - p.replace( - "", "" * vlm_config.image_feature_size - ) + p.replace("", + "" * vlm_config.image_feature_size) for p in HF_IMAGE_PROMPTS ] - with vllm_runner( - model_id, dtype=dtype, enforce_eager=True, **vlm_config.as_cli_args_dict() - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy( - vllm_image_prompts, max_tokens, images=vllm_images - ) + with vllm_runner(model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id - ) + vllm_outputs[i], vlm_config, model_id) assert ( hf_output_str == vllm_output_str ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert ( - hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + assert (hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 4f9ca839cac75..5d1033b85a416 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -1004,7 +1004,7 @@ def __init__( super().__init__() self.img_size = img_size - self.patch_embed = PatchEmbed( + self.patch_embed = ImagePatchEmbed( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, @@ -1384,7 +1384,7 @@ def add_decomposed_rel_pos( return attn -class PatchEmbed(nn.Module): +class ImagePatchEmbed(nn.Module): """ Image to Patch Embedding. 
""" @@ -1551,8 +1551,8 @@ def build_vision_tower(self, vision_tower_params): vision_tower = create_sam_vit(**vision_tower_params) forward_kwargs = dict() - else: # huggingface - from transformers import CLIPVisionModel + else: + from vllm.model_executor.models.clip import CLIPVisionModel vision_tower = CLIPVisionModel.from_pretrained( **vision_tower_params) @@ -1734,7 +1734,8 @@ def __init__( self.aligner = aligner_cls(aligner_config.params) language_config = config.language_config - self.language_model = LlamaModel(language_config, cache_config, quant_config) + self.language_model = LlamaModel(language_config, cache_config, + quant_config) self.image_processor = VLMImageProcessor(self.image_size) self.logits_processor = LogitsProcessor(language_config.vocab_size) self.sampler = Sampler() From de6879e11a2e3735377a905e5845b8f238bc0d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 28 Jun 2024 11:03:11 +0800 Subject: [PATCH 06/47] Remove timm dependency and Code Formatting --- vllm/model_executor/models/deepseek_vl.py | 519 +++++++++++++++++++++- 1 file changed, 498 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 5d1033b85a416..ea37399c5b8de 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -20,8 +20,10 @@ import math import warnings import copy +import collections.abc from dataclasses import dataclass +from enum import Enum from functools import partial from typing import ( Callable, @@ -36,11 +38,14 @@ Type, Union, ) +from itertools import repeat import torch import torch.nn as nn import torchvision.transforms import torch.nn.functional as F +from torch import _assert +from torch.utils.checkpoint import checkpoint import numpy as np import torchvision import torchvision.transforms.functional @@ -52,16 +57,6 @@ BatchFeature, ) from transformers.image_utils import to_numpy_array -from timm.layers import ( - AttentionPoolLatent, - DropPath, - LayerType, - Mlp, - PatchDropout, - PatchEmbed, - resample_abs_pos_embed, -) -from timm.models._manipulate import checkpoint_seq from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -84,6 +79,490 @@ IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) +LayerType = Union[str, Callable, Type[torch.nn.Module]] + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + + +class Format(str, Enum): + NCHW = "NCHW" + NHWC = "NHWC" + NCL = "NCL" + NLC = "NLC" + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py +class AttentionPoolLatent(nn.Module): + """Attention pooling w/ latent query""" + + fused_attn: torch.jit.Final[bool] + + def __init__( + self, + in_features: int, + out_features: int = None, + embed_dim: int = None, + num_heads: int = 8, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + qk_norm: bool = False, + latent_len: int = 1, + latent_dim: int = None, + pos_embed: str = "", + pool_type: str = "token", + norm_layer: Optional[nn.Module] = None, + drop: float = 0.0, + ): + super().__init__() + embed_dim = embed_dim or in_features + out_features = out_features or in_features + assert embed_dim % num_heads == 0 
+ self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.scale = self.head_dim**-0.5 + self.pool = pool_type + self.fused_attn = True + + if pos_embed == "abs": + spatial_len = self.feat_size + self.pos_embed = nn.Parameter(torch.zeros(spatial_len, + in_features)) + else: + self.pos_embed = None + + self.latent_dim = latent_dim or embed_dim + self.latent_len = latent_len + self.latent = nn.Parameter(torch.zeros(1, self.latent_len, embed_dim)) + + self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias) + self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.proj = nn.Linear(embed_dim, embed_dim) + self.proj_drop = nn.Dropout(drop) + + self.norm = (norm_layer(out_features) + if norm_layer is not None else nn.Identity()) + self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio)) + + def forward(self, x): + B, N, C = x.shape + + if self.pos_embed is not None: + # FIXME interpolate + x = x + self.pos_embed.unsqueeze(0).to(x.dtype) + + q_latent = self.latent.expand(B, -1, -1) + q = (self.q(q_latent).reshape(B, self.latent_len, self.num_heads, + self.head_dim).transpose(1, 2)) + + kv = (self.kv(x).reshape(B, N, 2, self.num_heads, + self.head_dim).permute(2, 0, 3, 1, 4)) + k, v = kv.unbind(0) + + q, k = self.q_norm(q), self.k_norm(k) + + if self.fused_attn: + x = F.scaled_dot_product_attention(q, k, v) + else: + q = q * self.scale + attn = q @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + x = attn @ v + x = x.transpose(1, 2).reshape(B, self.latent_len, C) + x = self.proj(x) + x = self.proj_drop(x) + + x = x + self.mlp(self.norm(x)) + + # optional pool if latent seq_len > 1 and pooled output is desired + if self.pool == "token": + x = x[:, 0] + elif self.pool == "avg": + x = x.mean(1) + return x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py +def drop_path(x, + drop_prob: float = 0.0, + training: bool = False, + scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. 
+ + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f"drop_prob={round(self.drop_prob,3):0.3f}" + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/mlp.py +class Mlp(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0.0, + use_conv=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + linear_layer = partial(nn.Conv2d, + kernel_size=1) if use_conv else nn.Linear + + self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.norm = (norm_layer(hidden_features) + if norm_layer is not None else nn.Identity()) + self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_dropout.py +class PatchDropout(nn.Module): + """ + https://arxiv.org/abs/2212.00794 + """ + + return_indices: torch.jit.Final[bool] + + def __init__( + self, + prob: float = 0.5, + num_prefix_tokens: int = 1, + ordered: bool = False, + return_indices: bool = False, + ): + super().__init__() + assert 0 <= prob < 1.0 + self.prob = prob + self.num_prefix_tokens = ( + num_prefix_tokens # exclude CLS token (or other prefix tokens) + ) + self.ordered = ordered + self.return_indices = return_indices + + def forward( + self, x + ) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]: + if not self.training or self.prob == 0.0: + if self.return_indices: + return x, None + return x + + if self.num_prefix_tokens: + prefix_tokens, x = ( + x[:, :self.num_prefix_tokens], + x[:, self.num_prefix_tokens:], + ) + else: + prefix_tokens = None + + B = x.shape[0] + L = x.shape[1] + num_keep = max(1, int(L * (1.0 - self.prob))) + keep_indices = torch.argsort(torch.randn(B, L, device=x.device), + dim=-1)[:, :num_keep] + if self.ordered: + # NOTE does not need to maintain patch order in typical transformer use, + # but possibly useful for debug / visualization + keep_indices = keep_indices.sort(dim=-1)[0] + x = x.gather(1, + keep_indices.unsqueeze(-1).expand((-1, -1) + x.shape[2:])) + + if prefix_tokens is not None: + x = torch.cat((prefix_tokens, x), dim=1) + + if self.return_indices: + return x, keep_indices + return 
x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_embed.py +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + output_fmt: Format + dynamic_img_pad: torch.jit.Final[bool] + + def __init__( + self, + img_size: Optional[int] = 224, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten: bool = True, + output_fmt: Optional[str] = None, + bias: bool = True, + strict_img_size: bool = True, + dynamic_img_pad: bool = False, + ): + super().__init__() + self.patch_size = to_2tuple(patch_size) + if img_size is not None: + self.img_size = to_2tuple(img_size) + self.grid_size = tuple( + [s // p for s, p in zip(self.img_size, self.patch_size)]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + else: + self.img_size = None + self.grid_size = None + self.num_patches = None + + if output_fmt is not None: + self.flatten = False + self.output_fmt = Format(output_fmt) + else: + # flatten spatial dim and transpose to channels last, kept for bwd compat + self.flatten = flatten + self.output_fmt = Format.NCHW + self.strict_img_size = strict_img_size + self.dynamic_img_pad = dynamic_img_pad + + self.proj = nn.Conv2d(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + bias=bias) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: + if as_scalar: + return max(self.patch_size) + else: + return self.patch_size + + def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]: + """Get grid (feature) size for given image size taking account of dynamic padding. + NOTE: must be torchscript compatible so using fixed tuple indexing + """ + if self.dynamic_img_pad: + return math.ceil(img_size[0] / self.patch_size[0]), math.ceil( + img_size[1] / self.patch_size[1]) + else: + return img_size[0] // self.patch_size[0], img_size[ + 1] // self.patch_size[1] + + def forward(self, x): + B, C, H, W = x.shape + if self.img_size is not None: + if self.strict_img_size: + _assert( + H == self.img_size[0], + f"Input height ({H}) doesn't match model ({self.img_size[0]}).", + ) + _assert( + W == self.img_size[1], + f"Input width ({W}) doesn't match model ({self.img_size[1]}).", + ) + elif not self.dynamic_img_pad: + _assert( + H % self.patch_size[0] == 0, + f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).", + ) + _assert( + W % self.patch_size[1] == 0, + f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).", + ) + if self.dynamic_img_pad: + pad_h = (self.patch_size[0] - + H % self.patch_size[0]) % self.patch_size[0] + pad_w = (self.patch_size[1] - + W % self.patch_size[1]) % self.patch_size[1] + x = F.pad(x, (0, pad_w, 0, pad_h)) + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # NCHW -> NLC + elif self.output_fmt != Format.NCHW: + x = nchw_to(x, self.output_fmt) + x = self.norm(x) + return x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/pos_embed.py +def resample_abs_pos_embed( + posemb, + new_size: List[int], + old_size: Optional[List[int]] = None, + num_prefix_tokens: int = 1, + interpolation: str = "bicubic", + antialias: bool = True, + verbose: bool = False, +): + # sort out sizes, assume square if old size not provided + num_pos_tokens = posemb.shape[1] + num_new_tokens = new_size[0] * new_size[1] + num_prefix_tokens + if num_new_tokens == num_pos_tokens and 
new_size[0] == new_size[1]: + return posemb + + if old_size is None: + hw = int(math.sqrt(num_pos_tokens - num_prefix_tokens)) + old_size = hw, hw + + if num_prefix_tokens: + posemb_prefix, posemb = ( + posemb[:, :num_prefix_tokens], + posemb[:, num_prefix_tokens:], + ) + else: + posemb_prefix, posemb = None, posemb + + # do the interpolation + embed_dim = posemb.shape[-1] + orig_dtype = posemb.dtype + posemb = posemb.float() # interpolate needs float32 + posemb = posemb.reshape(1, old_size[0], old_size[1], + -1).permute(0, 3, 1, 2) + posemb = F.interpolate(posemb, + size=new_size, + mode=interpolation, + antialias=antialias) + posemb = posemb.permute(0, 2, 3, 1).reshape(1, -1, embed_dim) + posemb = posemb.to(orig_dtype) + + # add back extra (class, etc) prefix tokens + if posemb_prefix is not None: + posemb = torch.cat([posemb_prefix, posemb], dim=1) + + if not torch.jit.is_scripting() and verbose: + print(f"Resized position embedding: {old_size} to {new_size}.") + + return posemb + + +def checkpoint_seq(functions, + x, + every=1, + flatten=False, + skip_last=False, + preserve_rng_state=True): + r"""A helper function for checkpointing sequential models. + + Sequential models execute a list of modules/functions in order + (sequentially). Therefore, we can divide such a sequence into segments + and checkpoint each segment. All segments except run in :func:`torch.no_grad` + manner, i.e., not storing the intermediate activations. The inputs of each + checkpointed segment will be saved for re-running the segment in the backward pass. + + See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works. + + .. warning:: + Checkpointing currently only supports :func:`torch.autograd.backward` + and only if its `inputs` argument is not passed. :func:`torch.autograd.grad` + is not supported. + + .. warning: + At least one of the inputs needs to have :code:`requires_grad=True` if + grads are needed for model inputs, otherwise the checkpointed part of the + model won't have gradients. + + Args: + functions: A :class:`torch.nn.Sequential` or the list of modules or functions to run sequentially. + x: A Tensor that is input to :attr:`functions` + every: checkpoint every-n functions (default: 1) + flatten (bool): flatten nn.Sequential of nn.Sequentials + skip_last (bool): skip checkpointing the last function in the sequence if True + preserve_rng_state (bool, optional, default=True): Omit stashing and restoring + the RNG state during each checkpoint. + + Returns: + Output of running :attr:`functions` sequentially on :attr:`*inputs` + + Example: + >>> model = nn.Sequential(...) 
+ >>> input_var = checkpoint_seq(model, input_var, every=2) + """ + + def run_function(start, end, functions): + + def forward(_x): + for j in range(start, end + 1): + _x = functions[j](_x) + return _x + + return forward + + if isinstance(functions, torch.nn.Sequential): + functions = functions.children() + if flatten: + functions = chain.from_iterable(functions) + if not isinstance(functions, (tuple, list)): + functions = tuple(functions) + + num_checkpointed = len(functions) + if skip_last: + num_checkpointed -= 1 + end = -1 + for start in range(0, num_checkpointed, every): + end = min(start + every - 1, num_checkpointed - 1) + x = checkpoint( + run_function(start, end, functions), + x, + preserve_rng_state=preserve_rng_state, + ) + if skip_last: + return run_function(end + 1, len(functions) - 1, functions)(x) + return x class AttrDict: @@ -306,7 +785,7 @@ def forward( """ Args: - x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: + x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); otherwise it is the feature from the single vision encoder. @@ -417,7 +896,6 @@ def __init__( self.num_heads = num_heads self.head_dim = dim // num_heads self.scale = self.head_dim**-0.5 - # self.fused_attn = use_fused_attn() self.fused_attn = True self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) @@ -425,8 +903,8 @@ def __init__( self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) - self.proj_drop = (nn.Dropout(proj_drop) - if proj_drop > 0.0 else nn.Identity()) + self.proj_drop = nn.Dropout( + proj_drop) if proj_drop > 0.0 else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -501,8 +979,8 @@ def __init__( ) self.ls1 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path1 = (DropPath(drop_path) - if drop_path > 0.0 else nn.Identity()) + self.drop_path1 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = mlp_layer( @@ -513,8 +991,8 @@ def __init__( ) self.ls2 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path2 = (DropPath(drop_path) - if drop_path > 0.0 else nn.Identity()) + self.drop_path2 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -1551,7 +2029,7 @@ def build_vision_tower(self, vision_tower_params): vision_tower = create_sam_vit(**vision_tower_params) forward_kwargs = dict() - else: + else: from vllm.model_executor.models.clip import CLIPVisionModel vision_tower = CLIPVisionModel.from_pretrained( @@ -1571,8 +2049,7 @@ def feature_select(self, image_forward_outs): if self.select_feature == "patch": # if the output has cls_token image_features = image_features[:, 1:] - elif (self.select_feature == "cls_patch" - or self.select_feature == "same"): + elif self.select_feature == "cls_patch" or self.select_feature == "same": image_features = image_features else: raise ValueError( From 7cf06711bf30caba04f16873e6ddb58673cb7421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 28 Jun 2024 15:41:46 +0800 Subject: [PATCH 07/47] fix test failed --- requirements-test.txt | 3 +++ tests/models/test_deepseek_vl.py | 20 +++++++++++--------- 2 files 
changed, 14 insertions(+), 9 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index 8b68e0e939669..df14077d12fcf 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -20,3 +20,6 @@ aiohttp # quantization bitsandbytes==0.42.0 + +# Model +deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@681bffb \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 892e89422f2d4..bdf5c69528d9c 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,6 +1,7 @@ from typing import List, Tuple import pytest +import deepseek_vl.models from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig @@ -18,7 +19,7 @@ assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) -def iter_llava_configs(model_name: str): +def iter_deepseek_vl_configs(model_name: str): image_hw_to_feature_size = { (1024, 1024): 576, } @@ -42,7 +43,7 @@ def iter_llava_configs(model_name: str): model_and_vl_config = [ - *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), + *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), ] @@ -93,6 +94,14 @@ def test_models( """ model_id, vlm_config = model_and_config + with vllm_runner(model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, max_tokens, @@ -104,13 +113,6 @@ def test_models( for p in HF_IMAGE_PROMPTS ] - with vllm_runner(model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] From d2d3eeb6e129b2583136307436c011625d1cd35e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 28 Jun 2024 16:53:22 +0800 Subject: [PATCH 08/47] Modify the deepseek-vl version number --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index df14077d12fcf..e8b7d7e626748 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -22,4 +22,4 @@ aiohttp bitsandbytes==0.42.0 # Model -deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@681bffb \ No newline at end of file +deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@main \ No newline at end of file From 23311f65c97d4cf1f7d3a0402a6a6762348f0bf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 09:30:14 +0800 Subject: [PATCH 09/47] Delete failed test cases and dependencies to resolve conflicts --- requirements-test.txt | 5 +- tests/models/test_deepseek_vl.py | 125 ------------------------- vllm/model_executor/models/__init__.py | 1 - 3 files changed, 1 insertion(+), 130 deletions(-) delete mode 100644 tests/models/test_deepseek_vl.py diff --git a/requirements-test.txt b/requirements-test.txt index e8b7d7e626748..3ebfc16547e44 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -19,7 +19,4 @@ sentence-transformers # required for embedding aiohttp # quantization -bitsandbytes==0.42.0 - -# Model -deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@main \ No newline at end of file 
+bitsandbytes==0.42.0 \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py deleted file mode 100644 index bdf5c69528d9c..0000000000000 --- a/tests/models/test_deepseek_vl.py +++ /dev/null @@ -1,125 +0,0 @@ -from typing import List, Tuple - -import pytest -import deepseek_vl.models -from transformers import AutoTokenizer - -from vllm.config import VisionLanguageConfig - -from ..conftest import IMAGE_FILES - -pytestmark = pytest.mark.vlm - -# The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ - "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", - "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) - - -def iter_deepseek_vl_configs(model_name: str): - image_hw_to_feature_size = { - (1024, 1024): 576, - } - - for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), - ]: - yield ( - model_name, - VisionLanguageConfig( - image_input_type=input_type, - image_feature_size=f, - image_token_id=100015, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None, - ), - ) - - -model_and_vl_config = [ - *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str], - vlm_config: VisionLanguageConfig, model_id: str): - """Sanitize vllm output to be comparable with hf output. - The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, - x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... - It also reduces `output_str` from "bla" to "bla". - """ - input_ids, output_str = vllm_output - image_token_id = vlm_config.image_token_id - - tokenizer = AutoTokenizer.from_pretrained(model_id) - image_token_str = tokenizer.decode(image_token_id) - - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id - ] - hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "") - - return hf_input_ids, hf_output_str - - -# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] -@pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - hf_images, - vllm_images, - model_and_config, - dtype: str, - max_tokens: int, -) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test is under tests/images. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - model_id, vlm_config = model_and_config - - with vllm_runner(model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) - - with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: - hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, - max_tokens, - images=hf_images) - - vllm_image_prompts = [ - p.replace("", - "" * vlm_config.image_feature_size) - for p in HF_IMAGE_PROMPTS - ] - - - for i in range(len(HF_IMAGE_PROMPTS)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id) - assert ( - hf_output_str == vllm_output_str - ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert (hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 4c708478f716a..e7ced618c7be7 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -21,7 +21,6 @@ "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), - "MultiModalityCausalLM": ("deepseek_vl", "DeepSeekMultiModalityCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), From 89d785663931f6e2cb3f29ae3e7553bc62cee54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 09:33:19 +0800 Subject: [PATCH 10/47] resolve conflicts --- vllm/model_executor/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 69a65ff023bc9..b4f01a5dc98aa 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -22,6 +22,7 @@ "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), + "MultiModalityCausalLM": ("deepseek_vl", "DeepSeekMultiModalityCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), From 1eb7d483dbfa768fbfd0424aa08068e8c307b976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 10:07:12 +0800 Subject: [PATCH 11/47] fix code bug --- vllm/model_executor/models/deepseek_vl.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index ea37399c5b8de..dded8443ff708 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -38,7 +38,7 @@ Type, Union, ) -from itertools import repeat +from itertools import (repeat, chain) import torch import torch.nn as nn @@ -103,6 +103,16 @@ class Format(str, Enum): NLC = "NLC" +def nchw_to(x: torch.Tensor, fmt: Format): + if fmt == Format.NHWC: + x = x.permute(0, 2, 3, 1) + elif fmt == Format.NLC: + x = x.flatten(2).transpose(1, 2) + elif fmt == Format.NCL: + x = x.flatten(2) + return x + + # From 
https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py class AttentionPoolLatent(nn.Module): """Attention pooling w/ latent query""" @@ -408,11 +418,11 @@ def forward(self, x): if self.img_size is not None: if self.strict_img_size: _assert( - H == self.img_size[0], + self.img_size[0] == H, f"Input height ({H}) doesn't match model ({self.img_size[0]}).", ) _assert( - W == self.img_size[1], + self.img_size[1] == W, f"Input width ({W}) doesn't match model ({self.img_size[1]}).", ) elif not self.dynamic_img_pad: @@ -490,6 +500,7 @@ def resample_abs_pos_embed( return posemb +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/_manipulate.py def checkpoint_seq(functions, x, every=1, From 0f127c663ed054f073dab2dddc79049855ba9382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 15:49:31 +0800 Subject: [PATCH 12/47] fix error Line too long --- examples/deepseek_vl_example.py | 17 +- vllm/model_executor/models/deepseek_vl.py | 305 ++++++++---------- vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 3 +- .../transformers_utils/configs/deepseek_vl.py | 12 +- 5 files changed, 150 insertions(+), 191 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 2032fb2a9473a..715464635d599 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -5,17 +5,21 @@ import torch from PIL import Image -from vllm import LLM -from vllm.multimodal.image import ImageFeatureData, ImagePixelData -from vllm.model_executor.models.deepseek_vl import VLMImageProcessor - # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. # You can use `.buildkite/download-images.sh` to download them -from vllm import SamplingParams +from vllm import LLM, SamplingParams +from vllm.model_executor.models.deepseek_vl import VLMImageProcessor +from vllm.multimodal.image import ImageFeatureData, ImagePixelData sample_params = SamplingParams(temperature=0, max_tokens=1024) model = "deepseek-ai/deepseek-vl-7b-chat" +prompt = "You are a helpful language and vision assistant." \ + "You are able to understand the visual content that the user provides," \ + "and assist the user with a variety of tasks using natural language.\n" \ + "User: Describe the content of this image.\nAssistant:" + +prompt = prompt.replace("", "" * 576) def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): @@ -31,8 +35,6 @@ def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): enforce_eager=True, ) - prompt = f"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" - if disable_image_processor: image = get_image_features() else: @@ -62,7 +64,6 @@ def run_deepseek_vl_image_features(): max_model_len=3072, enforce_eager=True, ) - prompt = f"You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" image: torch.Tensor = get_image_features() diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index dded8443ff708..27fd994be94d2 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -1,68 +1,53 @@ # Copyright (c) 2023-2024 DeepSeek. # -# Permission is hereby granted, free of charge, to any person obtaining a copy of +# Permission is hereby granted,free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in # the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, +# use, copy, modify, merge, publish, distribute,sublicense,and/or sell copies of +# the Software,and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# IMPLIED,INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+import collections.abc +import copy import math import warnings -import copy -import collections.abc - from dataclasses import dataclass from enum import Enum from functools import partial -from typing import ( - Callable, - Dict, - Final, - List, - Literal, - Optional, - Sequence, - Set, - Tuple, - Type, - Union, -) -from itertools import (repeat, chain) +from itertools import chain, repeat +from typing import (Callable, Dict, Final, List, Literal, Optional, Sequence, + Set, Tuple, Type, Union) +import numpy as np import torch import torch.nn as nn -import torchvision.transforms import torch.nn.functional as F -from torch import _assert -from torch.utils.checkpoint import checkpoint -import numpy as np import torchvision +import torchvision.transforms import torchvision.transforms.functional -from transformers import PreTrainedModel from PIL import Image -from transformers import AutoImageProcessor, PretrainedConfig -from transformers.image_processing_utils import ( - BaseImageProcessor, - BatchFeature, -) +from torch import _assert +from torch.utils.checkpoint import checkpoint +from transformers import AutoImageProcessor, PretrainedConfig, PreTrainedModel +from transformers.image_processing_utils import (BaseImageProcessor, + BatchFeature) from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, ) + QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -71,9 +56,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import get_dummy_image_data from vllm.sequence import SamplerOutput -from .vlm_base import VisionLanguageModelBase from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig +from .vlm_base import VisionLanguageModelBase + ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -113,7 +99,7 @@ def nchw_to(x: torch.Tensor, fmt: Format): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py # noqa class AttentionPoolLatent(nn.Module): """Attention pooling w/ latent query""" @@ -205,20 +191,13 @@ def forward(self, x): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py -def drop_path(x, - drop_prob: float = 0.0, - training: bool = False, - scale_by_keep: bool = True): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for - changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use - 'survival rate' as the argument. 
- - """ +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py # noqa +def drop_path( + x, + drop_prob: float = 0.0, + training: bool = False, + scale_by_keep: bool = True, +): if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob @@ -230,9 +209,12 @@ def drop_path(x, return x * random_tensor -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py # noqa class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + """ + Drop paths (Stochastic Depth) per sample + (when applied in main path of residual blocks). + """ def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): super(DropPath, self).__init__() @@ -246,7 +228,7 @@ def extra_repr(self): return f"drop_prob={round(self.drop_prob,3):0.3f}" -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/mlp.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/mlp.py # noqa class Mlp(nn.Module): """MLP as used in Vision Transformer, MLP-Mixer and related networks""" @@ -266,8 +248,8 @@ def __init__( hidden_features = hidden_features or in_features bias = to_2tuple(bias) drop_probs = to_2tuple(drop) - linear_layer = partial(nn.Conv2d, - kernel_size=1) if use_conv else nn.Linear + linear_layer = (partial(nn.Conv2d, kernel_size=1) + if use_conv else nn.Linear) self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) self.act = act_layer() @@ -287,7 +269,7 @@ def forward(self, x): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_dropout.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_dropout.py # noqa class PatchDropout(nn.Module): """ https://arxiv.org/abs/2212.00794 @@ -333,7 +315,8 @@ def forward( keep_indices = torch.argsort(torch.randn(B, L, device=x.device), dim=-1)[:, :num_keep] if self.ordered: - # NOTE does not need to maintain patch order in typical transformer use, + # NOTE does not need to maintain patch order in typical + # transformer use, # but possibly useful for debug / visualization keep_indices = keep_indices.sort(dim=-1)[0] x = x.gather(1, @@ -347,7 +330,7 @@ def forward( return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_embed.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_embed.py # noqa class PatchEmbed(nn.Module): """2D Image to Patch Embedding""" @@ -383,17 +366,20 @@ def __init__( self.flatten = False self.output_fmt = Format(output_fmt) else: - # flatten spatial dim and transpose to channels last, kept for bwd compat + # flatten spatial dim and transpose to channels last, + # kept for bwd compat self.flatten = flatten self.output_fmt = Format.NCHW self.strict_img_size = strict_img_size self.dynamic_img_pad = dynamic_img_pad - self.proj = nn.Conv2d(in_chans, - embed_dim, - kernel_size=patch_size, - stride=patch_size, - bias=bias) + self.proj = nn.Conv2d( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + bias=bias, + ) self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: @@ -403,15 +389,18 @@ def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: return self.patch_size def dynamic_feat_size(self, img_size: 
Tuple[int, int]) -> Tuple[int, int]: - """Get grid (feature) size for given image size taking account of dynamic padding. + """Get grid (feature) size for given image size taking account + of dynamic padding. NOTE: must be torchscript compatible so using fixed tuple indexing """ if self.dynamic_img_pad: return math.ceil(img_size[0] / self.patch_size[0]), math.ceil( img_size[1] / self.patch_size[1]) else: - return img_size[0] // self.patch_size[0], img_size[ - 1] // self.patch_size[1] + return ( + img_size[0] // self.patch_size[0], + img_size[1] // self.patch_size[1], + ) def forward(self, x): B, C, H, W = x.shape @@ -419,20 +408,20 @@ def forward(self, x): if self.strict_img_size: _assert( self.img_size[0] == H, - f"Input height ({H}) doesn't match model ({self.img_size[0]}).", + f"Input height ({H}) doesn't match model ({self.img_size[0]}).", # noqa ) _assert( self.img_size[1] == W, - f"Input width ({W}) doesn't match model ({self.img_size[1]}).", + f"Input width ({W}) doesn't match model ({self.img_size[1]}).", # noqa ) elif not self.dynamic_img_pad: _assert( H % self.patch_size[0] == 0, - f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).", + f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).", # noqa ) _assert( W % self.patch_size[1] == 0, - f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).", + f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).", # noqa ) if self.dynamic_img_pad: pad_h = (self.patch_size[0] - @@ -449,7 +438,7 @@ def forward(self, x): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/pos_embed.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/pos_embed.py # noqa def resample_abs_pos_embed( posemb, new_size: List[int], @@ -500,49 +489,15 @@ def resample_abs_pos_embed( return posemb -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/_manipulate.py -def checkpoint_seq(functions, - x, - every=1, - flatten=False, - skip_last=False, - preserve_rng_state=True): - r"""A helper function for checkpointing sequential models. - - Sequential models execute a list of modules/functions in order - (sequentially). Therefore, we can divide such a sequence into segments - and checkpoint each segment. All segments except run in :func:`torch.no_grad` - manner, i.e., not storing the intermediate activations. The inputs of each - checkpointed segment will be saved for re-running the segment in the backward pass. - - See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works. - - .. warning:: - Checkpointing currently only supports :func:`torch.autograd.backward` - and only if its `inputs` argument is not passed. :func:`torch.autograd.grad` - is not supported. - - .. warning: - At least one of the inputs needs to have :code:`requires_grad=True` if - grads are needed for model inputs, otherwise the checkpointed part of the - model won't have gradients. - - Args: - functions: A :class:`torch.nn.Sequential` or the list of modules or functions to run sequentially. - x: A Tensor that is input to :attr:`functions` - every: checkpoint every-n functions (default: 1) - flatten (bool): flatten nn.Sequential of nn.Sequentials - skip_last (bool): skip checkpointing the last function in the sequence if True - preserve_rng_state (bool, optional, default=True): Omit stashing and restoring - the RNG state during each checkpoint. 
- - Returns: - Output of running :attr:`functions` sequentially on :attr:`*inputs` - - Example: - >>> model = nn.Sequential(...) - >>> input_var = checkpoint_seq(model, input_var, every=2) - """ +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/_manipulate.py # noqa +def checkpoint_seq( + functions, + x, + every=1, + flatten=False, + skip_last=False, + preserve_rng_state=True, +): def run_function(start, end, functions): @@ -798,7 +753,8 @@ def forward( Args: x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, - then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); + then it comes from the hybrid vision encoder, + and x = high_res_x, low_res_x); otherwise it is the feature from the single vision encoder. Returns: @@ -818,8 +774,9 @@ def forward( def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + # Cut & paste from PyTorch official master until it's in a few official + # releases - RW Method based on + # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf def norm_cdf(x): # Computes standard normal cumulative distribution function return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 @@ -856,25 +813,6 @@ def norm_cdf(x): def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): - # type: (torch.Tensor, float, float, float, float) -> torch.Tensor - r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first - convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its original dtype. - Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn - from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. 
- Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - with torch.no_grad(): dtype = tensor.dtype tensor_fp32 = tensor.float() @@ -914,8 +852,8 @@ def __init__( self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout( - proj_drop) if proj_drop > 0.0 else nn.Identity() + self.proj_drop = (nn.Dropout(proj_drop) + if proj_drop > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -990,8 +928,8 @@ def __init__( ) self.ls1 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path1 = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path1 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) self.norm2 = norm_layer(dim) self.mlp = mlp_layer( @@ -1002,8 +940,8 @@ def __init__( ) self.ls2 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path2 = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path2 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -1014,7 +952,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class VisionTransformer(nn.Module): """Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + A PyTorch impl of : `An Image is Worth 16x16 Words: + Transformers for Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 """ @@ -1060,17 +999,21 @@ def __init__( patch_size: Patch size. in_chans: Number of image input channels. num_classes: Number of classes for classification head. - global_pool: Type of global pooling for final sequence (default: 'token'). + global_pool: Type of global pooling for final sequence + (default: 'token'). embed_dim: Transformer embedding dimension. depth: Depth of transformer. num_heads: Number of attention heads. mlp_ratio: Ratio of mlp hidden dim to embedding dim. qkv_bias: Enable bias for qkv projections if True. - init_values: Layer-scale init values (layer-scale enabled if not None). + init_values: Layer-scale init values + (layer-scale enabled if not None). class_token: Use class token. - no_embed_class: Don't include position embeddings for class (or reg) tokens. + no_embed_class: Don't include position embeddings for class + (or reg) tokens. reg_tokens: Number of register tokens. - fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'. + fc_norm: Pre head norm after pool (instead of before), if None, + enabled when global_pool == 'avg'. drop_rate: Head dropout rate. pos_drop_rate: Position embedding dropout rate. attn_drop_rate: Attention dropout rate. 
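Note: the trunc_normal_ hunk above strips the long timm docstring but keeps the behaviour it described, namely that the truncated-normal init is performed in float32 and the result is cast back so bfloat16 parameters are supported. A minimal standalone sketch of that pattern, assuming only stock PyTorch (trunc_normal_bf16_safe is an illustrative name, not a function from this patch):

import torch
import torch.nn as nn


def trunc_normal_bf16_safe(tensor: torch.Tensor,
                           mean: float = 0.0,
                           std: float = 1.0,
                           a: float = -2.0,
                           b: float = 2.0) -> torch.Tensor:
    # Initialise in float32, then copy the values back into the original
    # dtype (e.g. bfloat16), mirroring the float32 round trip kept here.
    with torch.no_grad():
        tmp = tensor.float()
        nn.init.trunc_normal_(tmp, mean=mean, std=std, a=a, b=b)
        tensor.copy_(tmp.to(tensor.dtype))
    return tensor


w = torch.empty(3, 5, dtype=torch.bfloat16)
trunc_normal_bf16_safe(w, std=0.02)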
@@ -1085,8 +1028,7 @@ def __init__( assert global_pool in ("", "avg", "token", "map") assert class_token or global_pool != "token" use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm - # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6) - # act_layer = get_act_layer(act_layer) or nn.GELU + norm_layer = partial(nn.LayerNorm, eps=1e-6) act_layer = nn.GELU @@ -1200,7 +1142,7 @@ def reset_classifier(self, num_classes: int, global_pool=None) -> None: assert global_pool in ("", "avg", "token", "map") if global_pool == "map" and self.attn_pool is None: raise AssertionError( - "Cannot currently add attention pooling in reset_classifier()." + "Cannot currently add attention pooling in reset_classifier()." # noqa ) elif global_pool != "map " and self.attn_pool is not None: self.attn_pool = None # remove attention pooling @@ -1229,7 +1171,8 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: if self.no_embed_class: # deit-3, updated JAX (big vision) - # position embedding does not overlap with class token, add then concat + # position embedding does not overlap with class token, + # add then concat x = x + pos_embed if to_cat: x = torch.cat(to_cat + [x], dim=1) @@ -1274,7 +1217,8 @@ def get_intermediate_layers( """Intermediate layer accessor (NOTE: This is a WIP experiment). Inspired by DINO / DINOv2 interface """ - # take last n blocks if n is an int, if in is a sequence, select by matching indices + # take last n blocks if n is an int, if in is a sequence, + # select by matching indices outputs = self._intermediate_layers(x, n) if norm: outputs = [self.norm(out) for out in outputs] @@ -1484,10 +1428,12 @@ def __init__( norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. - use_rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - window_size (int): Window size for window attention blocks. - global_attn_indexes (list): Indexes for blocks using global attention. + use_rel_pos (bool): If True, add relative positional embeddings to + the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. window_size (int): Window size for window + attention blocks. global_attn_indexes (list): Indexes for blocks + using global attention. downsample_channels (list): Channels for downsampling layers. """ super().__init__() @@ -1605,7 +1551,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Block(nn.Module): - """Transformer blocks with support of window attention and residual propagation blocks""" + """ + Transformer blocks with support of window attention and + residual propagation blocks + """ def __init__( self, @@ -1625,14 +1574,18 @@ def __init__( dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool): If True, add a learnable bias to query, key, value. + qkv_bias (bool): If True, add a learnable bias to + query, key, value. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. - use_rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - window_size (int): Window size for window attention blocks. 
If it equals 0, then - use global attention. - input_size (tuple(int, int) or None): Input resolution for calculating the relative + use_rel_pos (bool): If True, add relative positional embeddings to + the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. + window_size (int): Window size for window attention blocks. If + it equals 0, then use global attention. input_size + (tuple(int, int) or None): Input resolution for calculating + the relative positional parameter size. """ super().__init__() @@ -1689,10 +1642,14 @@ def __init__( Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. - qkv_bias (bool): If True, add a learnable bias to query, key, value. - rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - input_size (tuple(int, int) or None): Input resolution for calculating the relative + qkv_bias (bool): If True, add a learnable bias to + query, key, value. + rel_pos (bool): If True, add relative positional embeddings + to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. + input_size (tuple(int, int) or None): Input resolution for + calculating the relative positional parameter size. """ super().__init__() @@ -1707,7 +1664,7 @@ def __init__( if self.use_rel_pos: assert ( input_size is not None - ), "Input size must be provided if using relative positional encoding." + ), "Input size must be provided if using relative positional encoding." # noqa # initialize relative positional embeddings self.rel_pos_h = nn.Parameter( torch.zeros(2 * input_size[0] - 1, head_dim)) @@ -1752,7 +1709,8 @@ def window_partition(x: torch.Tensor, window_size (int): window size. Returns: - windows: windows after partition with [B * num_windows, window_size, window_size, C]. + windows: windows after partition with [B * num_windows, window_size, + window_size, C]. (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape @@ -1779,7 +1737,8 @@ def window_unpartition( """ Window unpartition into original sequences and removing padding. Args: - windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + windows (tensor): input tokens with + [B * num_windows, window_size, window_size, C]. window_size (int): window size. pad_hw (Tuple): padded height and width (Hp, Wp). hw (Tuple): original height and width (H, W) before padding. @@ -1845,11 +1804,10 @@ def add_decomposed_rel_pos( ) -> torch.Tensor: """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. - https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). - rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. q_size (Tuple): spatial sequence size of query q with (q_h, q_w). k_size (Tuple): spatial sequence size of key k with (k_h, k_w). 
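Note: the window-attention hunks above are easier to follow with the shapes written out. window_partition pads the [B, H, W, C] feature map so height and width are multiples of the window size and reshapes it into non-overlapping windows; window_unpartition later reverses the reshape and crops the padding. A rough self-contained sketch of the partition step under those assumptions (window_partition_demo is illustrative, not the patched function):

import torch
import torch.nn.functional as F


def window_partition_demo(x: torch.Tensor, window_size: int):
    # x: [B, H, W, C] -> windows: [B * num_windows, ws, ws, C], plus padded size.
    B, H, W, C = x.shape
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h or pad_w:
        # F.pad pads the last dims first: (C_l, C_r, W_l, W_r, H_l, H_r).
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w
    x = x.view(B, Hp // window_size, window_size,
               Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size,
                                                  window_size, C)
    return windows, (Hp, Wp)


x = torch.randn(2, 14, 14, 32)
windows, (Hp, Wp) = window_partition_demo(x, window_size=8)
print(windows.shape, (Hp, Wp))  # torch.Size([8, 8, 8, 32]) (16, 16)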
@@ -2060,7 +2018,8 @@ def feature_select(self, image_forward_outs): if self.select_feature == "patch": # if the output has cls_token image_features = image_features[:, 1:] - elif self.select_feature == "cls_patch" or self.select_feature == "same": + elif (self.select_feature == "cls_patch" + or self.select_feature == "same"): image_features = image_features else: raise ValueError( @@ -2164,7 +2123,7 @@ def forward(self, images: torch.Tensor): else: raise ValueError( - "Currently only support `feature`, `sequence`, `add` and `tuple` concat type." + "Currently only support `feature`, `sequence`, `add` and `tuple` concat type." # noqa ) return images_features diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 98df9875f19e7..60d5a8a20a36c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,9 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, + DeepSeekMultiModalityConfig, JAISConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig, - DeepSeekMultiModalityConfig) + MPTConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f79de04a5ad06..7de695a7b6022 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,5 +1,7 @@ from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.dbrx import DbrxConfig +from vllm.transformers_utils.configs.deepseek_vl import ( + DeepSeekMultiModalityConfig) # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. @@ -7,7 +9,6 @@ from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig -from vllm.transformers_utils.configs.deepseek_vl import DeepSeekMultiModalityConfig __all__ = [ "ChatGLMConfig", diff --git a/vllm/transformers_utils/configs/deepseek_vl.py b/vllm/transformers_utils/configs/deepseek_vl.py index b14d8cbdf5b3c..5a17a8c13b840 100644 --- a/vllm/transformers_utils/configs/deepseek_vl.py +++ b/vllm/transformers_utils/configs/deepseek_vl.py @@ -1,25 +1,23 @@ # Copyright (c) 2023-2024 DeepSeek. # -# Permission is hereby granted, free of charge, to any person obtaining a copy of +# Permission is hereby granted,free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in # the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, +# use, copy, modify, merge, publish, distribute,sublicense,and/or sell copies of +# the Software,and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# IMPLIED,INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from transformers import AutoConfig -from transformers import PretrainedConfig -from transformers import LlamaConfig +from transformers import AutoConfig, LlamaConfig, PretrainedConfig class VisionConfig(PretrainedConfig): From 78612619a56e3f1e3d18aa00ba8b3838990c3c6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 2 Jul 2024 15:46:49 +0800 Subject: [PATCH 13/47] add test case and fix update --- tests/models/test_deepseek_vl.py | 268 ++++++++++++++++++++++ vllm/model_executor/models/deepseek_vl.py | 56 ++++- 2 files changed, 318 insertions(+), 6 deletions(-) create mode 100644 tests/models/test_deepseek_vl.py diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py new file mode 100644 index 0000000000000..eb5d40d26b304 --- /dev/null +++ b/tests/models/test_deepseek_vl.py @@ -0,0 +1,268 @@ +from typing import List, Tuple + +import pytest + +import torch +from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM + +from transformers import LlamaForCausalLM +from transformers import AutoTokenizer + +from vllm.config import VisionLanguageConfig + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets + +from vllm.model_executor.models.deepseek_vl import ( + model_name_to_cls, + MultiModalityPreTrainedModel, + VLMImageProcessor, +) +from vllm.transformers_utils.config import DeepSeekMultiModalityConfig + + + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", + "You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", +] + + + +class MultiModalityCausalLM(MultiModalityPreTrainedModel): + def __init__(self, config: DeepSeekMultiModalityConfig): + super().__init__(config) + + vision_config = config.vision_config + vision_cls = model_name_to_cls(vision_config.cls) + self.vision_model = vision_cls(**vision_config.params) + + aligner_config = config.aligner_config + aligner_cls = model_name_to_cls(aligner_config.cls) + self.aligner = aligner_cls(aligner_config.params) + + language_config = config.language_config + self.language_model = LlamaForCausalLM(language_config) + + def prepare_inputs_embeds( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + images_seq_mask: torch.LongTensor, + **kwargs, + ): + """ + + Args: + input_ids (torch.LongTensor): [b, T] + pixel_values (torch.FloatTensor): [b, n_images, 3, h, w] + images_seq_mask (torch.BoolTensor): [b, T] + + assert torch.sum(images_seq_mask) == torch.sum(images_emb_mask) + + Returns: + input_embeds (torch.Tensor): [b, T, D] + """ + + bs, n = pixel_values.shape[0:2] + p_b, p_n, p_c, p_h, p_w = pixel_values.shape + images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) + images_embeds = self.aligner(self.vision_model(images)) + + # [b x n, T2, D] -> [b, n x T2, D] + _, t, d = images_embeds.shape + images_embeds = images_embeds.reshape(bs, n * t, d) + + # [b, T, D] + input_ids[input_ids < 0] = 0 # ignore the image embeddings + inputs_embeds = self.language_model.get_input_embeddings()( + input_ids + ).reshape(1, -1, 4096) + + # replace with the image embeddings + images_embeds = images_embeds.reshape( + 1, -1, self.config.aligner_config.params["n_embed"] + ) + inputs_embeds[images_seq_mask] = images_embeds + + return inputs_embeds + + +def get_input(tokenizer, prompt, image): + + image_id = 100015 + vl_image = VLMImageProcessor(1024) + input_ids = tokenizer.encode(prompt) + input_ids = torch.LongTensor(input_ids) + image_token_mask = input_ids == image_id + images_outputs = vl_image(image, return_tensors="pt") + images_emb_mask = torch.ones(1, 1, 576) == 1 + prepare = { + "sft_format": prompt, + "input_ids": input_ids.to("cuda"), + "pixel_values": images_outputs.pixel_values.to(torch.bfloat16) + .to("cuda") + .reshape(1, -1, 3, 1024, 1024), + "num_image_tokens": 576, + "images_seq_mask": image_token_mask.to("cuda").reshape(1, -1), + "images_emb_mask": images_emb_mask.to("cuda"), + "attention_mask": torch.ones(1, len(input_ids)).to("cuda"), + } + return prepare + + +def iter_llava_configs(model_name: str): + image_hw_to_feature_size = { + (1024, 1024): 576, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), + ]: + yield ( + model_name, + VisionLanguageConfig( + image_input_type=input_type, + image_feature_size=f, + image_token_id=100015, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None, + ), + ) + + +model_and_vl_config = [ + *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), +] + + +def vllm_to_hf_output( + vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, + model_id: str, +): + """Sanitize vllm output to be comparable with hf output. 
+ The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". + """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id + for idx, input_id in enumerate(input_ids) + if input_id != image_token_id or input_ids[idx - 1] != image_token_id + ] + hf_output_str = output_str.replace( + image_token_str * vlm_config.image_feature_size, "" + ) + + return hf_input_ids, hf_output_str + + +# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models( + hf_runner, + vllm_runner, + hf_images, + vllm_images, + model_and_config, + dtype: str, + max_tokens: int, +) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + model_id, vlm_config = model_and_config + + vllm_image_prompts = [ + p.replace( + "", + "" * vlm_config.image_feature_size, + ) + for p in HF_IMAGE_PROMPTS + ] + + with vllm_runner( + model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict(), + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + vllm_image_prompts, max_tokens, images=vllm_images + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + AutoModelForCausalLM.register( + DeepSeekMultiModalityConfig, MultiModalityCausalLM + ) + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: + prepare_input_one = get_input( + tokenizer, + HF_IMAGE_PROMPTS[0].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_two = get_input( + tokenizer, + HF_IMAGE_PROMPTS[1].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) + prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) + prepare_input = torch.concat(prepare_input_one, prepare_input_two) + attention_mask = torch.concat( + prepare_input_one["attention_mask"], + prepare_input_two["attention_mask"], + ) + hf_outputs = hf_model.generate_greedy( + HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images, + inputs_embeds=prepare_input, + attention_mask=attention_mask, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + do_sample=False, + use_cache=True, + ) + + + + + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id + ) + assert ( + hf_output_str == vllm_output_str + ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" + assert ( + hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 27fd994be94d2..16e6a10c95530 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ 
b/vllm/model_executor/models/deepseek_vl.py @@ -44,7 +44,7 @@ from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -53,13 +53,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import get_dummy_image_data -from vllm.sequence import SamplerOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData +from vllm.multimodal.base import VisionLanguageModelBase +from vllm.multimodal.image import ImageFeatureData, ImagePixelData +from vllm.sequence import SamplerOutput, SequenceData from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig -from .vlm_base import VisionLanguageModelBase - ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -68,6 +67,51 @@ LayerType = Union[str, Callable, Type[torch.nn.Module]] +def _get_dummy_seq_data(seq_len: int, + vlm_config: VisionLanguageConfig) -> SequenceData: + # NOTE: We assume that token is repeated `image_feature_size` times + # and then concatenated with the text prompt + # TODO: Enable other ways of inserting the image into the prompt + + token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size + token_ids += [0] * (seq_len - vlm_config.image_feature_size) + + return SequenceData(token_ids) + + +def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor: + if vlm_config.image_processor is None: + values_dtype = torch.float16 + else: + values_dtype = torch.uint8 + + return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype) + + +def get_dummy_image_data( + seq_len: int, + model_config: ModelConfig, + vlm_config: VisionLanguageConfig, +) -> Tuple[SequenceData, MultiModalData]: + """Standard dummy data factory for image data (to be used in + :meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`).""" + seq_data = _get_dummy_seq_data(seq_len, vlm_config) + values = _get_dummy_values(vlm_config) + + config_input_type = vlm_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + fake_mm_data: MultiModalData + if config_input_type == ImageInputType.PIXEL_VALUES: + fake_mm_data = ImagePixelData(values) + elif config_input_type == ImageInputType.IMAGE_FEATURES: + fake_mm_data = ImageFeatureData(values) + else: + raise NotImplementedError + + return seq_data, fake_mm_data + + # From PyTorch internals def _ntuple(n): From 9b2e11677383c834b57d679089b1523d982e853c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 2 Jul 2024 16:08:47 +0800 Subject: [PATCH 14/47] uodate test case --- tests/models/test_deepseek_vl.py | 137 ++++++++++++++++++------------- 1 file changed, 81 insertions(+), 56 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index eb5d40d26b304..c585ede7d8b1f 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing 
import List, Optional, Tuple, Type import pytest @@ -21,7 +21,6 @@ from vllm.transformers_utils.config import DeepSeekMultiModalityConfig - pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass @@ -31,7 +30,6 @@ ] - class MultiModalityCausalLM(MultiModalityPreTrainedModel): def __init__(self, config: DeepSeekMultiModalityConfig): super().__init__(config) @@ -114,7 +112,7 @@ def get_input(tokenizer, prompt, image): return prepare -def iter_llava_configs(model_name: str): +def iter_deepseek_vl_configs(model_name: str): image_hw_to_feature_size = { (1024, 1024): 576, } @@ -138,7 +136,7 @@ def iter_llava_configs(model_name: str): model_and_vl_config = [ - *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), + *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), ] @@ -174,14 +172,16 @@ def vllm_to_hf_output( @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - hf_images, - vllm_images, - model_and_config, +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model_and_config: Tuple[str, VisionLanguageConfig], + *, dtype: str, max_tokens: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ) -> None: """Inference result should be the same between hf and vllm. @@ -193,6 +193,7 @@ def test_models( The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] vllm_image_prompts = [ p.replace( @@ -202,59 +203,61 @@ def test_models( for p in HF_IMAGE_PROMPTS ] - with vllm_runner( - model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict(), - ) as vllm_model: + with vllm_runner(model_id, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] vllm_outputs = vllm_model.generate_greedy( vllm_image_prompts, max_tokens, images=vllm_images ) - - tokenizer = AutoTokenizer.from_pretrained(model_id) AutoModelForCausalLM.register( DeepSeekMultiModalityConfig, MultiModalityCausalLM ) - with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: - prepare_input_one = get_input( - tokenizer, - HF_IMAGE_PROMPTS[0].replace( - "", "" * 576 - ), - hf_images, - ) - prepare_input_two = get_input( - tokenizer, - HF_IMAGE_PROMPTS[1].replace( - "", "" * 576 - ), - hf_images, - ) - prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) - prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) - prepare_input = torch.concat(prepare_input_one, prepare_input_two) - attention_mask = torch.concat( - prepare_input_one["attention_mask"], - prepare_input_two["attention_mask"], - ) - hf_outputs = hf_model.generate_greedy( - HF_IMAGE_PROMPTS, - max_tokens, - images=hf_images, - inputs_embeds=prepare_input, - attention_mask=attention_mask, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - do_sample=False, - use_cache=True, + tokenizer = AutoTokenizer.from_pretrained(model_id) + hf_model = AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True + ) + hf_model = 
hf_model.to("cuda").eval() + prepare_input_one = get_input( + tokenizer, + HF_IMAGE_PROMPTS[0].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_two = get_input( + tokenizer, + HF_IMAGE_PROMPTS[1].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) + prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) + prepare_input = torch.concat(prepare_input_one, prepare_input_two) + attention_mask = torch.concat( + prepare_input_one["attention_mask"], + prepare_input_two["attention_mask"], + ) + outputs = hf_model.generate( + inputs_embeds=prepare_input, + attention_mask=attention_mask, + max_new_tokens=max_tokens, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + do_sample=False, + use_cache=True, + ) + hf_outputs = [] + for o in outputs: + hf_outputs.append( + o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True) ) - - - - for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_to_hf_output( @@ -266,3 +269,25 @@ def test_models( assert ( hf_output_ids == vllm_output_ids ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + + +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models( + hf_runner, + vllm_runner, + image_assets, + model_and_config, + dtype: str, + max_tokens: int, +) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model_and_config, + dtype=dtype, + max_tokens=max_tokens, + tensor_parallel_size=1, + ) From 80533ea6b16e6114b6be44aae6cef1cf8084af5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 2 Jul 2024 16:17:35 +0800 Subject: [PATCH 15/47] uodate test case and fix test bugs --- tests/models/test_deepseek_vl.py | 122 +++++++++++++++---------------- 1 file changed, 58 insertions(+), 64 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index c585ede7d8b1f..953aa365a71af 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,36 +1,33 @@ from typing import List, Optional, Tuple, Type import pytest - import torch -from transformers import AutoTokenizer -from transformers import AutoModelForCausalLM - -from transformers import LlamaForCausalLM -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM from vllm.config import VisionLanguageConfig - -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets - from vllm.model_executor.models.deepseek_vl import ( - model_name_to_cls, - MultiModalityPreTrainedModel, - VLMImageProcessor, -) + MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.transformers_utils.config import DeepSeekMultiModalityConfig +from ..conftest import HfRunner, VllmRunner, _ImageAssets pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = [ - "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", - "You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", + "You are a helpful language and vision assistant. You are able" \ + "to understand the visual content that the user provides, and assist " \ + "the user with a variety of tasks using natural language.\n User:" \ + " What's the content of the image?\nAssistant:", + "You are a helpful language and vision assistant. You are able to "\ + "understand the visual content that the user provides, and assist the "\ + "user with a variety of tasks using natural language.\n User: "\ + "What is the season?\nAssistant:", ] class MultiModalityCausalLM(MultiModalityPreTrainedModel): + def __init__(self, config: DeepSeekMultiModalityConfig): super().__init__(config) @@ -77,13 +74,11 @@ def prepare_inputs_embeds( # [b, T, D] input_ids[input_ids < 0] = 0 # ignore the image embeddings inputs_embeds = self.language_model.get_input_embeddings()( - input_ids - ).reshape(1, -1, 4096) + input_ids).reshape(1, -1, 4096) # replace with the image embeddings images_embeds = images_embeds.reshape( - 1, -1, self.config.aligner_config.params["n_embed"] - ) + 1, -1, self.config.aligner_config.params["n_embed"]) inputs_embeds[images_seq_mask] = images_embeds return inputs_embeds @@ -99,15 +94,21 @@ def get_input(tokenizer, prompt, image): images_outputs = vl_image(image, return_tensors="pt") images_emb_mask = torch.ones(1, 1, 576) == 1 prepare = { - "sft_format": prompt, - "input_ids": input_ids.to("cuda"), - "pixel_values": images_outputs.pixel_values.to(torch.bfloat16) - .to("cuda") - .reshape(1, -1, 3, 1024, 1024), - "num_image_tokens": 576, - "images_seq_mask": image_token_mask.to("cuda").reshape(1, -1), - "images_emb_mask": images_emb_mask.to("cuda"), - "attention_mask": torch.ones(1, len(input_ids)).to("cuda"), + "sft_format": + prompt, + "input_ids": + input_ids.to("cuda"), + "pixel_values": + images_outputs.pixel_values.to(torch.bfloat16).to("cuda").reshape( + 1, -1, 3, 1024, 1024), + "num_image_tokens": + 576, + "images_seq_mask": + image_token_mask.to("cuda").reshape(1, -1), + "images_emb_mask": + images_emb_mask.to("cuda"), + "attention_mask": + torch.ones(1, len(input_ids)).to("cuda"), } return prepare @@ -148,7 +149,8 @@ def vllm_to_hf_output( """Sanitize vllm output to be comparable with hf output. The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... - It also reduces `output_str` from "bla" to "bla". + It also reduces `output_str` from + "bla" to "bla". 
""" input_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id @@ -157,13 +159,11 @@ def vllm_to_hf_output( image_token_str = tokenizer.decode(image_token_id) hf_input_ids = [ - input_id - for idx, input_id in enumerate(input_ids) + input_id for idx, input_id in enumerate(input_ids) if input_id != image_token_id or input_ids[idx - 1] != image_token_id ] hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "" - ) + image_token_str * vlm_config.image_feature_size, "") return hf_input_ids, hf_output_str @@ -199,40 +199,37 @@ def run_test( p.replace( "", "" * vlm_config.image_feature_size, - ) - for p in HF_IMAGE_PROMPTS + ) for p in HF_IMAGE_PROMPTS ] - with vllm_runner(model_id, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) as vllm_model: + with vllm_runner( + model_id, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vlm_config.as_cli_args_dict(), + ) as vllm_model: vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] - vllm_outputs = vllm_model.generate_greedy( - vllm_image_prompts, max_tokens, images=vllm_images - ) - AutoModelForCausalLM.register( - DeepSeekMultiModalityConfig, MultiModalityCausalLM - ) + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, + MultiModalityCausalLM) tokenizer = AutoTokenizer.from_pretrained(model_id) - hf_model = AutoModelForCausalLM.from_pretrained( - model_id, trust_remote_code=True - ) + hf_model = AutoModelForCausalLM.from_pretrained(model_id, + trust_remote_code=True) hf_model = hf_model.to("cuda").eval() prepare_input_one = get_input( tokenizer, - HF_IMAGE_PROMPTS[0].replace( - "", "" * 576 - ), + HF_IMAGE_PROMPTS[0].replace("", + "" * 576), hf_images, ) prepare_input_two = get_input( tokenizer, - HF_IMAGE_PROMPTS[1].replace( - "", "" * 576 - ), + HF_IMAGE_PROMPTS[1].replace("", + "" * 576), hf_images, ) prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) @@ -252,23 +249,20 @@ def run_test( do_sample=False, use_cache=True, ) - hf_outputs = [] + hf_outputs: List = [] for o in outputs: hf_outputs.append( - o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True) - ) + (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id - ) + vllm_outputs[i], vlm_config, model_id) assert ( hf_output_str == vllm_output_str ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert ( - hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + assert (hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" @pytest.mark.parametrize("model_and_config", model_and_vl_config) From 92c80841e7acfef35728af1cf0b03fb6087a93a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 10:20:20 +0800 Subject: [PATCH 16/47] update test case and fix bugs --- tests/models/test_deepseek_vl.py | 8 +-- vllm/model_executor/models/deepseek_vl.py | 81 ++++++++--------------- 2 files changed, 29 insertions(+), 60 deletions(-) diff --git 
a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 953aa365a71af..1c007f5772e1b 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -210,7 +210,7 @@ def run_test( enforce_eager=True, **vlm_config.as_cli_args_dict(), ) as vllm_model: - vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] + vllm_images = [asset.for_vllm() for asset in image_assets] vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, max_tokens, images=vllm_images) @@ -222,14 +222,12 @@ def run_test( hf_model = hf_model.to("cuda").eval() prepare_input_one = get_input( tokenizer, - HF_IMAGE_PROMPTS[0].replace("", - "" * 576), + vllm_image_prompts[0], hf_images, ) prepare_input_two = get_input( tokenizer, - HF_IMAGE_PROMPTS[1].replace("", - "" * 576), + vllm_image_prompts[1], hf_images, ) prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 16e6a10c95530..867202ca088af 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -44,7 +44,8 @@ from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig +from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.inputs import INPUT_REGISTRY, InputContext from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -53,12 +54,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData -from vllm.multimodal.base import VisionLanguageModelBase -from vllm.multimodal.image import ImageFeatureData, ImagePixelData -from vllm.sequence import SamplerOutput, SequenceData +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig +from .clip import dummy_seq_data_for_clip +from .interfaces import SupportsVision + ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -67,51 +69,6 @@ LayerType = Union[str, Callable, Type[torch.nn.Module]] -def _get_dummy_seq_data(seq_len: int, - vlm_config: VisionLanguageConfig) -> SequenceData: - # NOTE: We assume that token is repeated `image_feature_size` times - # and then concatenated with the text prompt - # TODO: Enable other ways of inserting the image into the prompt - - token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size - token_ids += [0] * (seq_len - vlm_config.image_feature_size) - - return SequenceData(token_ids) - - -def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor: - if vlm_config.image_processor is None: - values_dtype = torch.float16 - else: - values_dtype = torch.uint8 - - return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype) - - -def get_dummy_image_data( - seq_len: int, - model_config: ModelConfig, - vlm_config: VisionLanguageConfig, -) -> Tuple[SequenceData, MultiModalData]: - """Standard dummy data factory for image data (to be used in - 
:meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`).""" - seq_data = _get_dummy_seq_data(seq_len, vlm_config) - values = _get_dummy_values(vlm_config) - - config_input_type = vlm_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType - - fake_mm_data: MultiModalData - if config_input_type == ImageInputType.PIXEL_VALUES: - fake_mm_data = ImagePixelData(values) - elif config_input_type == ImageInputType.IMAGE_FEATURES: - fake_mm_data = ImageFeatureData(values) - else: - raise NotImplementedError - - return seq_data, fake_mm_data - - # From PyTorch internals def _ntuple(n): @@ -2196,10 +2153,24 @@ class MultiModalityPreTrainedModel(PreTrainedModel): _skip_keys_device_placement = "past_key_values" -@MULTIMODAL_REGISTRY.register_image_feature_input() -@MULTIMODAL_REGISTRY.register_image_pixel_input() -@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) -class DeepSeekMultiModalityCausalLM(VisionLanguageModelBase): +def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(DeepSeekMultiModalityConfig) + vision_config = hf_config.vision_config + image_size = vision_config.params.get("image_size") + if not image_size: + # Get image size for 7b model + image_size = vision_config.params["high_res_cfg"]["image_size"] + seq_data = dummy_seq_data_for_clip(vision_config, + seq_len, + image_token_id=100015, + image_feature_size_override=576) + mm_data = Image.new("RGB", (image_size, image_size), color=0) + return seq_data, mm_data + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) +class DeepSeekMultiModalityCausalLM(nn.Module, SupportsVision): def __init__( self, @@ -2208,7 +2179,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): - super().__init__(config, ) + super().__init__() self.config = config vision_config = config.vision_config aligner_config = config.aligner_config From d6452bb7967d55fbfa1a35fc5d8f93f4a220f0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 10:28:10 +0800 Subject: [PATCH 17/47] update test example --- examples/deepseek_vl_example.py | 68 ++++----------------------------- 1 file changed, 8 insertions(+), 60 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 715464635d599..43d7fe55b6a01 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -1,18 +1,13 @@ -import argparse import os import subprocess -import torch from PIL import Image # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. # You can use `.buildkite/download-images.sh` to download them from vllm import LLM, SamplingParams -from vllm.model_executor.models.deepseek_vl import VLMImageProcessor -from vllm.multimodal.image import ImageFeatureData, ImagePixelData sample_params = SamplingParams(temperature=0, max_tokens=1024) - model = "deepseek-ai/deepseek-vl-7b-chat" prompt = "You are a helpful language and vision assistant." 
\ "You are able to understand the visual content that the user provides," \ @@ -22,7 +17,7 @@ prompt = prompt.replace("", "" * 576) -def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): +def run_deepseek_vl(*, disable_image_processor: bool = False): llm = LLM( model=model, image_input_type="pixel_values", @@ -35,15 +30,14 @@ def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): enforce_eager=True, ) - if disable_image_processor: - image = get_image_features() - else: - image = Image.open("images/stop_sign.jpg") + image = Image.open("images/stop_sign.jpg") outputs = llm.generate( { "prompt": prompt, - "multi_modal_data": ImagePixelData(image), + "multi_modal_data": { + "image": image + }, }, sampling_params=sample_params, ) @@ -53,57 +47,11 @@ def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): print(generated_text) -def run_deepseek_vl_image_features(): - llm = LLM( - model=model, - image_input_type="image_features", - image_token_id=100015, - image_input_shape="1,3,1024,1024", - image_feature_size=576, - gpu_memory_utilization=0.9, - max_model_len=3072, - enforce_eager=True, - ) - - image: torch.Tensor = get_image_features() - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": ImageFeatureData(image), - }, - sampling_params=sample_params, - ) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -def get_image_features(): - image_feature = VLMImageProcessor(1024)( - Image.open("images/stop_sign.jpg"))["pixel_values"] - torch.save(image_feature, "images/deepseek_vl_stop_sign.pt") - return torch.load("images/deepseek_vl_stop_sign.pt") - - -def main(args): - if args.type == "pixel_values": - run_deepseek_vl_pixel_values() - else: - run_deepseek_vl_image_features() +def main(): + run_deepseek_vl() if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Demo on deepseek-vl") - parser.add_argument( - "--type", - type=str, - choices=["pixel_values", "image_features"], - default="pixel_values", - help="image input type", - ) - args = parser.parse_args() # Download from s3 s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" local_directory = "images" @@ -120,4 +68,4 @@ def main(args): local_directory, "--no-sign-request", ]) - main(args) + main() From f63a1a1a0e205a3bed5d3aab91c620f83e8b03b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 11:36:13 +0800 Subject: [PATCH 18/47] fix test case --- tests/models/test_deepseek_vl.py | 41 +++++++++++++------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 1c007f5772e1b..f8b75d5eb5825 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -10,6 +10,7 @@ from vllm.transformers_utils.config import DeepSeekMultiModalityConfig from ..conftest import HfRunner, VllmRunner, _ImageAssets +from .utils import check_outputs_equal pytestmark = pytest.mark.vlm @@ -115,25 +116,15 @@ def get_input(tokenizer, prompt, image): def iter_deepseek_vl_configs(model_name: str): image_hw_to_feature_size = { - (1024, 1024): 576, + (336, 336): 576, } for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), - ]: - yield ( - model_name, - VisionLanguageConfig( - 
image_input_type=input_type, - image_feature_size=f, - image_token_id=100015, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None, - ), - ) + input_shape = (1, 3, h, w) + yield (model_name, + VisionLanguageConfig(image_feature_size=f, + image_token_id=100015, + image_input_shape=input_shape)) model_and_vl_config = [ @@ -252,15 +243,15 @@ def run_test( hf_outputs.append( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - for i in range(len(HF_IMAGE_PROMPTS)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id) - assert ( - hf_output_str == vllm_output_str - ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert (hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + check_outputs_equal( + hf_outputs, + [ + vllm_to_hf_output(vllm_output, vlm_config, model_id) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model_and_config", model_and_vl_config) From 1b90f47da0b83ebb7cc381b72fbd948645ae0ed4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 18:07:39 +0800 Subject: [PATCH 19/47] Adaptation code update --- examples/deepseek_vl_example.py | 14 +++++++------- vllm/model_executor/models/deepseek_vl.py | 17 ++++++++--------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 43d7fe55b6a01..b1d18ceb2af65 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -8,23 +8,23 @@ from vllm import LLM, SamplingParams sample_params = SamplingParams(temperature=0, max_tokens=1024) -model = "deepseek-ai/deepseek-vl-7b-chat" -prompt = "You are a helpful language and vision assistant." \ - "You are able to understand the visual content that the user provides," \ - "and assist the user with a variety of tasks using natural language.\n" \ +model = "/pretrained_models/deepseek-vl-7b-chat" +prompt = ( + "You are a helpful language and vision assistant." 
+ "You are able to understand the visual content that the user provides," + "and assist the user with a variety of tasks using natural language.\n" "User: Describe the content of this image.\nAssistant:" +) prompt = prompt.replace("", "" * 576) -def run_deepseek_vl(*, disable_image_processor: bool = False): +def run_deepseek_vl(): llm = LLM( model=model, - image_input_type="pixel_values", image_token_id=100015, image_input_shape="1,3,1024,1024", image_feature_size=576, - disable_image_processor=False, gpu_memory_utilization=0.9, max_model_len=3072, enforce_eager=True, diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 867202ca088af..3e10f2c0ee4db 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -2164,7 +2164,7 @@ def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): seq_len, image_token_id=100015, image_feature_size_override=576) - mm_data = Image.new("RGB", (image_size, image_size), color=0) + mm_data = {"image": Image.new("RGB", (image_size, image_size), color=0)} return seq_data, mm_data @@ -2172,13 +2172,11 @@ def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): @INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) class DeepSeekMultiModalityCausalLM(nn.Module, SupportsVision): - def __init__( - self, - config: DeepSeekMultiModalityConfig, - vision_language_config: VisionLanguageConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, + config: DeepSeekMultiModalityConfig, + vlm_config: VisionLanguageConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): super().__init__() self.config = config vision_config = config.vision_config @@ -2284,6 +2282,7 @@ def forward( positions, kv_caches, attn_metadata, + None, inputs_embeds=inputs_embeds, ) @@ -2291,7 +2290,7 @@ def forward( def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head.weight, hidden_states, + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits From 026c92fbd2d19f332eaa4d4d825dea2bace66fa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 4 Jul 2024 15:37:16 +0800 Subject: [PATCH 20/47] Adaptation code update --- examples/deepseek_vl_example.py | 30 +++---- tests/models/test_deepseek_vl.py | 4 +- vllm/model_executor/models/deepseek_vl.py | 103 ++++++++++++++-------- 3 files changed, 80 insertions(+), 57 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index b1d18ceb2af65..84be2ebd5b397 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -1,6 +1,7 @@ import os import subprocess +import torch from PIL import Image # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. @@ -8,27 +9,19 @@ from vllm import LLM, SamplingParams sample_params = SamplingParams(temperature=0, max_tokens=1024) -model = "/pretrained_models/deepseek-vl-7b-chat" -prompt = ( - "You are a helpful language and vision assistant." - "You are able to understand the visual content that the user provides," - "and assist the user with a variety of tasks using natural language.\n" +model = "deepseek-ai/deepseek-vl-7b-chat" +model = "deepseek-ai/deepseek-vl-1.3b-chat" +prompt = "You are a helpful language and vision assistant." 
\ + "You are able to understand the visual content that the user provides," \ + "and assist the user with a variety of tasks using natural language.\n" \ "User: Describe the content of this image.\nAssistant:" -) - -prompt = prompt.replace("", "" * 576) def run_deepseek_vl(): - llm = LLM( - model=model, - image_token_id=100015, - image_input_shape="1,3,1024,1024", - image_feature_size=576, - gpu_memory_utilization=0.9, - max_model_len=3072, - enforce_eager=True, - ) + llm = LLM(model=model, + max_model_len=3072, + enforce_eager=True, + dtype=torch.bfloat16) image = Image.open("images/stop_sign.jpg") @@ -39,8 +32,7 @@ def run_deepseek_vl(): "image": image }, }, - sampling_params=sample_params, - ) + sampling_params=sample_params) for o in outputs: generated_text = o.outputs[0].text diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index f8b75d5eb5825..53ca234b9f52f 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -244,8 +244,8 @@ def run_test( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) check_outputs_equal( - hf_outputs, - [ + outputs_0_lst=hf_outputs, + outputs_1_lst=[ vllm_to_hf_output(vllm_output, vlm_config, model_id) for vllm_output in vllm_outputs ], diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 3e10f2c0ee4db..ddf6ee22d09ff 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -44,8 +44,8 @@ from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VisionLanguageConfig -from vllm.inputs import INPUT_REGISTRY, InputContext +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -55,6 +55,8 @@ from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig @@ -67,6 +69,8 @@ IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) LayerType = Union[str, Callable, Type[torch.nn.Module]] +IMAGE_FEATURE_SIZE = 576 +IMAGE_TOKEN_ID = 100015 # From PyTorch internals @@ -213,7 +217,7 @@ def drop_path( # From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py # noqa class DropPath(nn.Module): """ - Drop paths (Stochastic Depth) per sample + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ @@ -390,7 +394,7 @@ def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: return self.patch_size def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]: - """Get grid (feature) size for given image size taking account + """Get grid (feature) size for given image size taking account of dynamic padding. 
NOTE: must be torchscript compatible so using fixed tuple indexing """ @@ -754,7 +758,7 @@ def forward( Args: x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, - then it comes from the hybrid vision encoder, + then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); otherwise it is the feature from the single vision encoder. @@ -953,7 +957,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class VisionTransformer(nn.Module): """Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 """ @@ -1000,20 +1004,20 @@ def __init__( patch_size: Patch size. in_chans: Number of image input channels. num_classes: Number of classes for classification head. - global_pool: Type of global pooling for final sequence + global_pool: Type of global pooling for final sequence (default: 'token'). embed_dim: Transformer embedding dimension. depth: Depth of transformer. num_heads: Number of attention heads. mlp_ratio: Ratio of mlp hidden dim to embedding dim. qkv_bias: Enable bias for qkv projections if True. - init_values: Layer-scale init values + init_values: Layer-scale init values (layer-scale enabled if not None). class_token: Use class token. - no_embed_class: Don't include position embeddings for class + no_embed_class: Don't include position embeddings for class (or reg) tokens. reg_tokens: Number of register tokens. - fc_norm: Pre head norm after pool (instead of before), if None, + fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'. drop_rate: Head dropout rate. pos_drop_rate: Position embedding dropout rate. @@ -1429,11 +1433,11 @@ def __init__( norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. - use_rel_pos (bool): If True, add relative positional embeddings to + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative - positional parameters. window_size (int): Window size for window - attention blocks. global_attn_indexes (list): Indexes for blocks + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. window_size (int): Window size for window + attention blocks. global_attn_indexes (list): Indexes for blocks using global attention. downsample_channels (list): Channels for downsampling layers. """ @@ -1553,7 +1557,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Block(nn.Module): """ - Transformer blocks with support of window attention and + Transformer blocks with support of window attention and residual propagation blocks """ @@ -1575,17 +1579,17 @@ def __init__( dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool): If True, add a learnable bias to + qkv_bias (bool): If True, add a learnable bias to query, key, value. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. - use_rel_pos (bool): If True, add relative positional embeddings to + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. 
- rel_pos_zero_init (bool): If True, zero initialize relative + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - window_size (int): Window size for window attention blocks. If - it equals 0, then use global attention. input_size - (tuple(int, int) or None): Input resolution for calculating + window_size (int): Window size for window attention blocks. If + it equals 0, then use global attention. input_size + (tuple(int, int) or None): Input resolution for calculating the relative positional parameter size. """ @@ -1643,13 +1647,13 @@ def __init__( Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. - qkv_bias (bool): If True, add a learnable bias to + qkv_bias (bool): If True, add a learnable bias to query, key, value. - rel_pos (bool): If True, add relative positional embeddings + rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - input_size (tuple(int, int) or None): Input resolution for + input_size (tuple(int, int) or None): Input resolution for calculating the relative positional parameter size. """ @@ -1738,7 +1742,7 @@ def window_unpartition( """ Window unpartition into original sequences and removing padding. Args: - windows (tensor): input tokens with + windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. window_size (int): window size. pad_hw (Tuple): padded height and width (Hp, Wp). @@ -2160,23 +2164,48 @@ def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): if not image_size: # Get image size for 7b model image_size = vision_config.params["high_res_cfg"]["image_size"] - seq_data = dummy_seq_data_for_clip(vision_config, - seq_len, - image_token_id=100015, - image_feature_size_override=576) + seq_data = dummy_seq_data_for_clip( + vision_config, + seq_len, + image_token_id=IMAGE_TOKEN_ID, + image_feature_size_override=IMAGE_FEATURE_SIZE, + ) mm_data = {"image": Image.new("RGB", (image_size, image_size), color=0)} return seq_data, mm_data +def input_processor_for_deepseek(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer) + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=IMAGE_TOKEN_ID, + repeat_count=IMAGE_FEATURE_SIZE, + ) + return LLMInputs( + prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data, + ) + + @MULTIMODAL_REGISTRY.register_image_input_mapper() @INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) +@INPUT_REGISTRY.register_input_processor(input_processor_for_deepseek) class DeepSeekMultiModalityCausalLM(nn.Module, SupportsVision): - def __init__(self, - config: DeepSeekMultiModalityConfig, - vlm_config: VisionLanguageConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: DeepSeekMultiModalityConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): super().__init__() self.config = config vision_config = config.vision_config @@ -2264,7 +2293,9 
@@ def forward( if image_features is not None and pixel_values is None: pixel_values = image_features if pixel_values is not None: - image_token_id = 100015 + target_dtype = self.lm_head.weight.dtype + pixel_values = pixel_values.to(target_dtype) + image_token_id = IMAGE_TOKEN_ID image_token_mask = input_ids == image_token_id inputs_embeds = self.prepare_inputs_embeds( input_ids, From ffa1cc7783e9a822acf94e3eebe7d2d4d069bcdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 11:58:17 +0800 Subject: [PATCH 21/47] Update Test case --- tests/models/test_deepseek_vl.py | 226 +++++++++++++++---------------- 1 file changed, 112 insertions(+), 114 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 53ca234b9f52f..3438837033e25 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -4,14 +4,17 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM -from vllm.config import VisionLanguageConfig +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.transformers_utils.config import DeepSeekMultiModalityConfig -from ..conftest import HfRunner, VllmRunner, _ImageAssets -from .utils import check_outputs_equal +from tests.conftest import HfRunner, VllmRunner, _ImageAssets +from tests.models.utils import check_logprobs_close +models = ["deepseek-ai/deepseek-vl-7b-chat"] +IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass @@ -85,10 +88,33 @@ def prepare_inputs_embeds( return inputs_embeds +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + tokenizer = AutoTokenizer.from_pretrained(model) + eos_token_id = tokenizer.eos_token_id + + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID + ] + + assert output_str[0] == " " + hf_output_str = output_str[1:] + if hf_output_ids[-1] == eos_token_id: + hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) + + return hf_output_ids, hf_output_str, out_logprobs + + def get_input(tokenizer, prompt, image): image_id = 100015 vl_image = VLMImageProcessor(1024) + prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) input_ids = torch.LongTensor(input_ids) image_token_mask = input_ids == image_id @@ -114,63 +140,16 @@ def get_input(tokenizer, prompt, image): return prepare -def iter_deepseek_vl_configs(model_name: str): - image_hw_to_feature_size = { - (336, 336): 576, - } - - for (h, w), f in image_hw_to_feature_size.items(): - input_shape = (1, 3, h, w) - yield (model_name, - VisionLanguageConfig(image_feature_size=f, - image_token_id=100015, - image_input_shape=input_shape)) - - -model_and_vl_config = [ - *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), -] - - -def vllm_to_hf_output( - vllm_output: Tuple[List[int], str], - vlm_config: VisionLanguageConfig, - model_id: str, -): - """Sanitize vllm output to be comparable with hf output. - The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, - x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... 
- It also reduces `output_str` from - "bla" to "bla". - """ - input_ids, output_str = vllm_output - image_token_id = vlm_config.image_token_id - - tokenizer = AutoTokenizer.from_pretrained(model_id) - image_token_str = tokenizer.decode(image_token_id) - - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id - ] - hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "") - - return hf_input_ids, hf_output_str - - -# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] -@pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], image_assets: _ImageAssets, - model_and_config: Tuple[str, VisionLanguageConfig], + model: str, *, + size_factors: List[float], dtype: str, max_tokens: int, + num_logprobs: int, tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, ) -> None: @@ -183,53 +162,57 @@ def run_test( Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. """ - model_id, vlm_config = model_and_config - hf_images = [asset.for_hf() for asset in image_assets] - - vllm_image_prompts = [ - p.replace( - "", - "" * vlm_config.image_feature_size, - ) for p in HF_IMAGE_PROMPTS - ] + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
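The NOTE comment above (run vLLM before the HuggingFace model so that CUDA is never initialized in the parent process before vLLM's fork-based multiprocessing backend starts its workers) can be illustrated with a small standalone script. This is not part of the patch; it assumes a CUDA-capable machine with PyTorch installed and only demonstrates the general fork-vs-CUDA pitfall and the usual "spawn" workaround.

import multiprocessing as mp

import torch


def _worker(_: int) -> float:
    # In a *forked* child whose parent already initialized CUDA, this call
    # typically raises "Cannot re-initialize CUDA in forked subprocess".
    return torch.zeros(1, device="cuda").item()


if __name__ == "__main__":
    torch.cuda.init()  # parent touches CUDA first; the situation the test avoids
    # A fresh interpreter per worker ("spawn") side-steps the problem, whereas
    # the default "fork" start method on Linux would hit the error above.
    with mp.get_context("spawn").Pool(1) as pool:
        print(pool.map(_worker, [0]))

This is the reason the test collects all vLLM outputs first and only afterwards loads the HuggingFace reference model onto the GPU.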
+ + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] - with vllm_runner( - model_id, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - **vlm_config.as_cli_args_dict(), - ) as vllm_model: - vllm_images = [asset.for_vllm() for asset in image_assets] - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, MultiModalityCausalLM) - tokenizer = AutoTokenizer.from_pretrained(model_id) - hf_model = AutoModelForCausalLM.from_pretrained(model_id, + tokenizer = AutoTokenizer.from_pretrained(model) + hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) hf_model = hf_model.to("cuda").eval() - prepare_input_one = get_input( - tokenizer, - vllm_image_prompts[0], - hf_images, - ) - prepare_input_two = get_input( - tokenizer, - vllm_image_prompts[1], - hf_images, - ) - prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) - prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) - prepare_input = torch.concat(prepare_input_one, prepare_input_two) + prepare_input_list = [] + inputs_embeds_list = [] + for prompts, images in inputs_per_image: + print(f'prompt: {prompts}') + print(f'images: {images}') + prepare_input = get_input( + tokenizer, + prompts, + images, + ) + prepare_input_list.append(prepare_input) + inputs_embeds_list.append( + hf_model.prepare_inputs_embeds(**prepare_input)) + + inputs_embeds = torch.concat(inputs_embeds_list) attention_mask = torch.concat( - prepare_input_one["attention_mask"], - prepare_input_two["attention_mask"], - ) + [x['attention_mask'] for x in prepare_input_list]) outputs = hf_model.generate( - inputs_embeds=prepare_input, + inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_tokens, pad_token_id=tokenizer.eos_token_id, @@ -239,38 +222,53 @@ def run_test( use_cache=True, ) hf_outputs: List = [] + for o in outputs: hf_outputs.append( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, vlm_config, model_id) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model_and_config", model_and_vl_config) + for hf_outputs, vllm_outputs in zip(hf_outputs, vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) + print('END---->') + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - image_assets, - model_and_config, 
- dtype: str, - max_tokens: int, -) -> None: +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: run_test( hf_runner, vllm_runner, image_assets, - model_and_config, + model, + size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, + num_logprobs=num_logprobs, tensor_parallel_size=1, ) From 17711727ed9fd623a75f066de20501a2c8c28123 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 13:04:31 +0800 Subject: [PATCH 22/47] fix Test case --- tests/models/test_deepseek_vl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 3438837033e25..ba671fdc70e42 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -124,18 +124,18 @@ def get_input(tokenizer, prompt, image): "sft_format": prompt, "input_ids": - input_ids.to("cuda"), + input_ids, "pixel_values": - images_outputs.pixel_values.to(torch.bfloat16).to("cuda").reshape( + images_outputs.pixel_values.to(torch.bfloat16).reshape( 1, -1, 3, 1024, 1024), "num_image_tokens": 576, "images_seq_mask": - image_token_mask.to("cuda").reshape(1, -1), + image_token_mask.reshape(1, -1), "images_emb_mask": - images_emb_mask.to("cuda"), + images_emb_mask, "attention_mask": - torch.ones(1, len(input_ids)).to("cuda"), + torch.ones(1, len(input_ids)), } return prepare @@ -193,7 +193,7 @@ def run_test( tokenizer = AutoTokenizer.from_pretrained(model) hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) - hf_model = hf_model.to("cuda").eval() + hf_model = hf_model prepare_input_list = [] inputs_embeds_list = [] for prompts, images in inputs_per_image: From 9287c7de88d9dd0e33c633cccb9096c79846451b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 13:52:30 +0800 Subject: [PATCH 23/47] Update Test case --- tests/models/test_deepseek_vl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index ba671fdc70e42..9e569d38abbba 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -113,6 +113,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def get_input(tokenizer, prompt, image): image_id = 100015 + prompt = prompt[0] + image = image[0] vl_image = VLMImageProcessor(1024) prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) From 5ff27a67f4983777dd1efb9545f703c2049eda46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 15:20:34 +0800 Subject: [PATCH 24/47] Add register_max_image_tokens --- vllm/model_executor/models/deepseek_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index ddf6ee22d09ff..f79d8b03c1c75 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -2195,6 +2195,7 @@ def input_processor_for_deepseek(ctx: InputContext, llm_inputs: LLMInputs): @MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(IMAGE_FEATURE_SIZE) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) @INPUT_REGISTRY.register_input_processor(input_processor_for_deepseek) class DeepSeekMultiModalityCausalLM(nn.Module, 
SupportsVision): From 6c41130939baa2d4ec2815aa10414c71d6708e0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 16:58:33 +0800 Subject: [PATCH 25/47] fix test case --- tests/models/test_deepseek_vl.py | 35 +++++++++++++------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 9e569d38abbba..986374dd8e7c5 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -4,14 +4,14 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs +from vllm import SamplingParams from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) +from vllm.sequence import SampleLogprobs from vllm.transformers_utils.config import DeepSeekMultiModalityConfig -from tests.conftest import HfRunner, VllmRunner, _ImageAssets -from tests.models.utils import check_logprobs_close +from ..conftest import HfRunner, VllmRunner, _ImageAssets +from .utils import check_outputs_equal models = ["deepseek-ai/deepseek-vl-7b-chat"] IMAGE_TOKEN_ID = 100015 @@ -116,7 +116,7 @@ def get_input(tokenizer, prompt, image): prompt = prompt[0] image = image[0] vl_image = VLMImageProcessor(1024) - prompt.replace('', '' * 576) + prompt = prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) input_ids = torch.LongTensor(input_ids) image_token_mask = input_ids == image_id @@ -148,7 +148,6 @@ def run_test( image_assets: _ImageAssets, model: str, *, - size_factors: List[float], dtype: str, max_tokens: int, num_logprobs: int, @@ -166,10 +165,8 @@ def run_test( """ images = [asset.pil_image for asset in image_assets] - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [([prompt], [image]) + for image, prompt in zip(images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. @@ -177,6 +174,7 @@ def run_test( # will hurt multiprocessing backend with fork method (the default method). 
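This commit's switch from check_logprobs_close to an exact check means each side now contributes plain (token_ids, generated_text) pairs, one per prompt. A toy illustration with made-up ids and text; the inline check_outputs_equal is only a stand-in for the helper imported from tests/models/utils.py.

    def check_outputs_equal(*, outputs_0_lst, outputs_1_lst, name_0, name_1):
        # stand-in for tests/models/utils.check_outputs_equal: element-wise equality
        for (ids_0, text_0), (ids_1, text_1) in zip(outputs_0_lst, outputs_1_lst):
            assert ids_0 == ids_1 and text_0 == text_1, f"{name_0} != {name_1}"

    hf_output = ([100000, 4194, 291], "The image shows a stop sign.")    # made-up
    vllm_output = ([100000, 4194, 291], "The image shows a stop sign.")  # made-up
    check_outputs_equal(outputs_0_lst=[hf_output],
                        outputs_1_lst=[vllm_output],
                        name_0="hf",
                        name_1="vllm")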
# max_model_len should be greater than image_feature_size + sample_params = SamplingParams(temperature=0) with vllm_runner(model, dtype=dtype, tensor_parallel_size=tensor_parallel_size, @@ -186,7 +184,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=images) + images=images, + sampling_params=sample_params) for prompts, images in inputs_per_image ] @@ -229,15 +228,10 @@ def run_test( hf_outputs.append( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - for hf_outputs, vllm_outputs in zip(hf_outputs, vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs_per_image): + check_outputs_equal( + outputs_0_lst=hf_output, + outputs_1_lst=vllm_output[:2], name_0="hf", name_1="vllm", ) @@ -268,7 +262,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, vllm_runner, image_assets, model, - size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, From 0e1bc5be318beab3b4d1afcd3cdaf4747549ab3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 17:44:39 +0800 Subject: [PATCH 26/47] fix test case --- tests/models/test_deepseek_vl.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 986374dd8e7c5..841ff56a07067 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -4,7 +4,6 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM -from vllm import SamplingParams from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.sequence import SampleLogprobs @@ -174,7 +173,6 @@ def run_test( # will hurt multiprocessing backend with fork method (the default method). 
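Dropping the explicit SamplingParams(temperature=0) just below is safe because the greedy helpers on the vLLM test runner are assumed to build an equivalent greedy configuration internally; roughly the following, which is a sketch rather than the runner's internals verbatim.

    from vllm import SamplingParams

    max_tokens, num_logprobs = 128, 5  # values used by this test's parametrization
    greedy_params = SamplingParams(temperature=0.0,
                                   max_tokens=max_tokens,
                                   logprobs=num_logprobs)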
# max_model_len should be greater than image_feature_size - sample_params = SamplingParams(temperature=0) with vllm_runner(model, dtype=dtype, tensor_parallel_size=tensor_parallel_size, @@ -184,8 +182,7 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=images, - sampling_params=sample_params) + images=images) for prompts, images in inputs_per_image ] From 3b3b8eceaff72e5838b7fee5595675b9e8b7424b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 10:30:41 +0800 Subject: [PATCH 27/47] fix test dtype error --- tests/models/test_deepseek_vl.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 841ff56a07067..d4a3835473775 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -109,7 +109,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, return hf_output_ids, hf_output_str, out_logprobs -def get_input(tokenizer, prompt, image): +def get_input(tokenizer, prompt, image, dtype): image_id = 100015 prompt = prompt[0] @@ -127,8 +127,7 @@ def get_input(tokenizer, prompt, image): "input_ids": input_ids, "pixel_values": - images_outputs.pixel_values.to(torch.bfloat16).reshape( - 1, -1, 3, 1024, 1024), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 1024, 1024), "num_image_tokens": 576, "images_seq_mask": @@ -191,17 +190,21 @@ def run_test( tokenizer = AutoTokenizer.from_pretrained(model) hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) + dtype_dict = { + 'float16': torch.float16, + 'half': torch.bfloat16, + 'float32': torch.float32, + 'auto': hf_model.dtype + } + dtype = dtype_dict.get(dtype, hf_model.dtype) + hf_model = hf_model.to(dtype) hf_model = hf_model prepare_input_list = [] inputs_embeds_list = [] for prompts, images in inputs_per_image: print(f'prompt: {prompts}') print(f'images: {images}') - prepare_input = get_input( - tokenizer, - prompts, - images, - ) + prepare_input = get_input(tokenizer, prompts, images, dtype) prepare_input_list.append(prepare_input) inputs_embeds_list.append( hf_model.prepare_inputs_embeds(**prepare_input)) From f2f29d1bbd122b0077ad2345129779cacbefd848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 10:37:02 +0800 Subject: [PATCH 28/47] fix test dtype error --- tests/models/test_deepseek_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index d4a3835473775..77c7277b061fa 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -191,6 +191,7 @@ def run_test( hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) dtype_dict = { + 'bfloat16': torch.bfloat16, 'float16': torch.float16, 'half': torch.bfloat16, 'float32': torch.float32, From 03133bbc3c20030c03479ae6ddbb52969a3fac55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 15:16:11 +0800 Subject: [PATCH 29/47] update doc and fix test error --- docs/source/models/supported_models.rst | 8 ++-- tests/models/test_deepseek_vl.py | 60 ++++++++++++------------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index fefda5d39e7a1..33e8b261f00ed 100644 --- a/docs/source/models/supported_models.rst 
+++ b/docs/source/models/supported_models.rst @@ -161,10 +161,6 @@ Decoder-only Language Models - Xverse - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - - * - :code:`DeepSeekMultiModalityCausalLM` - - deepseek-ai - - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc. - - .. note:: Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. @@ -198,6 +194,10 @@ Vision Language Models - Phi-3-Vision - :code:`microsoft/Phi-3-vision-128k-instruct`, etc. - + * - :code:`DeepSeekMultiModalityCausalLM` + - deepseek-ai + - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 77c7277b061fa..b96561e0d0cc8 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -199,43 +199,39 @@ def run_test( } dtype = dtype_dict.get(dtype, hf_model.dtype) hf_model = hf_model.to(dtype) - hf_model = hf_model - prepare_input_list = [] - inputs_embeds_list = [] + hf_outputs: List = [] for prompts, images in inputs_per_image: print(f'prompt: {prompts}') print(f'images: {images}') prepare_input = get_input(tokenizer, prompts, images, dtype) - prepare_input_list.append(prepare_input) - inputs_embeds_list.append( - hf_model.prepare_inputs_embeds(**prepare_input)) - - inputs_embeds = torch.concat(inputs_embeds_list) - attention_mask = torch.concat( - [x['attention_mask'] for x in prepare_input_list]) - outputs = hf_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - max_new_tokens=max_tokens, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - do_sample=False, - use_cache=True, - ) - hf_outputs: List = [] - - for o in outputs: - hf_outputs.append( - (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - - for hf_output, vllm_output in zip(hf_outputs, vllm_outputs_per_image): - check_outputs_equal( - outputs_0_lst=hf_output, - outputs_1_lst=vllm_output[:2], - name_0="hf", - name_1="vllm", + attention_mask = prepare_input['attention_mask'] + inputs_embeds = hf_model.prepare_inputs_embeds(**prepare_input) + outputs = hf_model.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + max_new_tokens=max_tokens, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + do_sample=False, + use_cache=True, ) + for o in outputs: + hf_outputs.append((o.cpu().tolist(), + tokenizer.decode(o.cpu().tolist(), + skip_special_tokens=True))) + vllm_outputs_list = [] + for vllm_outputs in vllm_outputs_per_image: + vllm_outputs_list.append([ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ][:2]) + print(f'hf_outputs --> {hf_outputs}') + print(f'vllm_outputs --> {vllm_outputs_list}') + check_outputs_equal(outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs_list, + name_0='hf', + name_1='vllm') print('END---->') From ff6c75816aa51e1494e95634f9f7e10a51a86732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 15:25:40 +0800 Subject: [PATCH 30/47] fix mypy error --- 
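The HF reference path in the previous patch funnels everything through prepare_inputs_embeds, whose core step is a boolean-mask scatter of the vision embeddings into the token embeddings. An isolated toy demo with made-up sizes (seq_len 6, hidden 8, 4 image-token slots):

    import torch

    inputs_embeds = torch.zeros(1, 6, 8)             # [batch, seq_len, hidden]
    images_embeds = torch.ones(4, 8)                 # one row per image token
    images_seq_mask = torch.tensor([[False, True, True, True, True, False]])

    inputs_embeds[images_seq_mask] = images_embeds   # overwrite the image-token slots
    assert inputs_embeds[0, 1:5].eq(1).all()
    assert inputs_embeds[0, 0].eq(0).all()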
tests/models/test_deepseek_vl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index b96561e0d0cc8..20771f7027acf 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -222,10 +222,11 @@ def run_test( skip_special_tokens=True))) vllm_outputs_list = [] for vllm_outputs in vllm_outputs_per_image: - vllm_outputs_list.append([ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ][:2]) + vllm_outputs_list.append( + tuple([ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ][:2])) print(f'hf_outputs --> {hf_outputs}') print(f'vllm_outputs --> {vllm_outputs_list}') check_outputs_equal(outputs_0_lst=hf_outputs, From 9d1f68e48ca8e7b4808d9d225acac1637363552d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 09:24:51 +0800 Subject: [PATCH 31/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 20771f7027acf..75c1ddc8b4f91 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -12,7 +12,7 @@ from ..conftest import HfRunner, VllmRunner, _ImageAssets from .utils import check_outputs_equal -models = ["deepseek-ai/deepseek-vl-7b-chat"] +models = ["deepseek-ai/deepseek-vl-1.3b-chat"] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm From 2025c39c87894c05251639e99688a649f8593f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 10:10:51 +0800 Subject: [PATCH 32/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 75c1ddc8b4f91..2f21278763bd8 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -114,7 +114,7 @@ def get_input(tokenizer, prompt, image, dtype): image_id = 100015 prompt = prompt[0] image = image[0] - vl_image = VLMImageProcessor(1024) + vl_image = VLMImageProcessor(384) prompt = prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) input_ids = torch.LongTensor(input_ids) @@ -184,7 +184,7 @@ def run_test( images=images) for prompts, images in inputs_per_image ] - + print(f'vllm_outputs_per_image -> {vllm_outputs_per_image}') AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, MultiModalityCausalLM) tokenizer = AutoTokenizer.from_pretrained(model) From 773aec876ddb6adc001f6846c6bfd69805bb3506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 10:56:43 +0800 Subject: [PATCH 33/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 2f21278763bd8..dfe4c0f3c5674 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -127,7 +127,7 @@ def get_input(tokenizer, prompt, image, dtype): "input_ids": input_ids, "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 1024, 1024), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384), "num_image_tokens": 576, "images_seq_mask": From ee5a3db135b41b063d607636f7bb4507eab5df77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= 
Date: Tue, 9 Jul 2024 16:09:15 +0800 Subject: [PATCH 34/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index dfe4c0f3c5674..fbc514a2d78e7 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -77,7 +77,7 @@ def prepare_inputs_embeds( # [b, T, D] input_ids[input_ids < 0] = 0 # ignore the image embeddings inputs_embeds = self.language_model.get_input_embeddings()( - input_ids).reshape(1, -1, 4096) + input_ids).reshape(1, -1, 2048) # replace with the image embeddings images_embeds = images_embeds.reshape( From 71ea404b602390dfc6523f2dad77429122e97c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 17:10:04 +0800 Subject: [PATCH 35/47] use gpu --- tests/models/test_deepseek_vl.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index fbc514a2d78e7..09db2367c1cf5 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -109,7 +109,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, return hf_output_ids, hf_output_str, out_logprobs -def get_input(tokenizer, prompt, image, dtype): +def get_input(tokenizer, prompt, image, dtype, device): image_id = 100015 prompt = prompt[0] @@ -125,17 +125,17 @@ def get_input(tokenizer, prompt, image, dtype): "sft_format": prompt, "input_ids": - input_ids, + input_ids.to(device), "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384).to(device), "num_image_tokens": 576, "images_seq_mask": - image_token_mask.reshape(1, -1), + image_token_mask.reshape(1, -1).to(device), "images_emb_mask": - images_emb_mask, + images_emb_mask.to(device), "attention_mask": - torch.ones(1, len(input_ids)), + torch.ones(1, len(input_ids)).to(device), } return prepare @@ -190,6 +190,7 @@ def run_test( tokenizer = AutoTokenizer.from_pretrained(model) hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) + device = 'cuda' dtype_dict = { 'bfloat16': torch.bfloat16, 'float16': torch.float16, @@ -198,7 +199,7 @@ def run_test( 'auto': hf_model.dtype } dtype = dtype_dict.get(dtype, hf_model.dtype) - hf_model = hf_model.to(dtype) + hf_model = hf_model.to(dtype).to(device) hf_outputs: List = [] for prompts, images in inputs_per_image: print(f'prompt: {prompts}') From 48862caf429ae75d29bbd21695aed8763d9a7df7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 17:15:36 +0800 Subject: [PATCH 36/47] use gpu --- tests/models/test_deepseek_vl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 09db2367c1cf5..fbac01f979a3b 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -127,7 +127,8 @@ def get_input(tokenizer, prompt, image, dtype, device): "input_ids": input_ids.to(device), "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384).to(device), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, + 384).to(device), "num_image_tokens": 576, "images_seq_mask": @@ -204,7 +205,7 @@ def run_test( for prompts, images in inputs_per_image: print(f'prompt: {prompts}') print(f'images: {images}') - 
prepare_input = get_input(tokenizer, prompts, images, dtype) + prepare_input = get_input(tokenizer, prompts, images, dtype, device) attention_mask = prepare_input['attention_mask'] inputs_embeds = hf_model.prepare_inputs_embeds(**prepare_input) outputs = hf_model.language_model.generate( From 348064acb180060bcf935603018939bef441fb6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 12 Jul 2024 10:47:29 +0800 Subject: [PATCH 37/47] fix Conflicting files and update test --- requirements-common.txt | 5 +- tests/models/test_deepseek_vl.py | 277 ++++++++++++++++++++---------- vllm/transformers_utils/config.py | 8 +- 3 files changed, 198 insertions(+), 92 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 5435707bb5c6b..9521b80d7efe9 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,6 +18,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.10.1 -outlines >= 0.0.43 # Requires torch >= 2.1.0 +outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 typing_extensions -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 \ No newline at end of file +filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +pyzmq \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index fbac01f979a3b..d31aa0ad4e0ea 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,18 +1,22 @@ from typing import List, Optional, Tuple, Type +from dataclasses import dataclass import pytest import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM +from transformers import AutoTokenizer, LlamaForCausalLM, AutoModelForVision2Seq +from transformers import LlamaTokenizerFast +from transformers.processing_utils import ProcessorMixin from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.sequence import SampleLogprobs +from vllm.multimodal.utils import rescale_image_size from vllm.transformers_utils.config import DeepSeekMultiModalityConfig from ..conftest import HfRunner, VllmRunner, _ImageAssets -from .utils import check_outputs_equal +from .utils import check_logprobs_close -models = ["deepseek-ai/deepseek-vl-1.3b-chat"] +models = ["/deepseek-ai/deepseek-vl-1.3b-chat"] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm @@ -29,6 +33,39 @@ ] +class DictOutput(object): + + def keys(self): + return self.__dict__.keys() + + def __getitem__(self, item): + return self.__dict__[item] + + def __setitem__(self, key, value): + self.__dict__[key] = value + + +@dataclass +class VLChatProcessorOutput(DictOutput): + sft_format: List[str] + input_ids: torch.Tensor + pixel_values: torch.Tensor + attention_mask: torch.Tensor + images_seq_mask: torch.BoolTensor + images_emb_mask: torch.BoolTensor + + def __len__(self): + return len(self.input_ids) + + def to(self, device): + self.input_ids = self.input_ids.to(device) + self.attention_mask = self.attention_mask.to(device) + self.images_seq_mask = self.images_seq_mask.to(device) + self.images_emb_mask = self.images_emb_mask.to(device) + self.pixel_values = self.pixel_values.to(device=device) + return self + + class MultiModalityCausalLM(MultiModalityPreTrainedModel): def __init__(self, config: DeepSeekMultiModalityConfig): @@ -67,6 +104,7 @@ def prepare_inputs_embeds( bs, n = 
pixel_values.shape[0:2] p_b, p_n, p_c, p_h, p_w = pixel_values.shape + pixel_values = pixel_values.to(self.dtype) images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) images_embeds = self.aligner(self.vision_model(images)) @@ -86,6 +124,121 @@ def prepare_inputs_embeds( return inputs_embeds + def generate(self, *args, **kwargs): + + sft_format = kwargs.pop('sft_format') + pixel_values = kwargs.pop('pixel_values') + images_seq_mask = kwargs.pop('images_seq_mask') + images_emb_mask = kwargs.pop('images_emb_mask') + input_ids = kwargs.pop('input_ids') + inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, + images_seq_mask) + tokenizer = AutoTokenizer.from_pretrained( + "/pretrained_models/deepseek-vl-1.3b-chat") + output = self.language_model.generate( + *args, + input_ids=input_ids, + inputs_embeds=inputs_embeds, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + **kwargs) + # output.sequences[0] = torch.concat([input_ids[0], output.sequences[0]]) + return output + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + +class VLChatProcessor(ProcessorMixin): + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + + attributes = ["image_processor", "tokenizer"] + + system_prompt = ( + "You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.") + + def __init__( + self, + image_processor: VLMImageProcessor, + tokenizer: LlamaTokenizerFast, + image_tag: str = "", + num_image_tokens: int = 576, + add_special_token: bool = False, + sft_format: str = "deepseek", + mask_prompt: bool = True, + ignore_id: int = -100, + **kwargs, + ): + self.image_processor = image_processor + self.tokenizer = tokenizer + + image_id = self.tokenizer.vocab.get(image_tag) + if image_id is None: + special_tokens = [image_tag] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + print(f"Add image tag = {image_tag} to the tokenizer") + + self.image_tag = image_tag + self.num_image_tokens = num_image_tokens + self.add_special_token = add_special_token + self.sft_format = sft_format + self.mask_prompt = mask_prompt + self.ignore_id = ignore_id + self.image_id = image_id + + super().__init__( + image_processor, + tokenizer, + image_tag, + num_image_tokens, + add_special_token, + sft_format, + mask_prompt, + ignore_id, + **kwargs, + ) + + def __call__(self, *arg, **kwargs): + prompt = kwargs.pop('text') + image = kwargs.pop('images') + return VLChatProcessorOutput(**self.get_input(prompt, image)) + + def get_input(self, prompt, image): + prompt = prompt + image = image + prompt = prompt.replace(self.image_tag, + self.image_tag * self.num_image_tokens) + input_ids = self.tokenizer.encode(prompt) + input_ids = torch.LongTensor(input_ids) + image_token_mask = input_ids == self.image_id + images_outputs = self.image_processor(image, return_tensors="pt") + images_emb_mask = torch.ones(1, 1, self.num_image_tokens) == 1 + image_size = self.image_processor.image_size + prepare = { + "sft_format": + prompt, + "input_ids": + input_ids.reshape(1, -1), + "pixel_values": + images_outputs.pixel_values.reshape(1, -1, 3, image_size, + image_size), + "images_seq_mask": + 
image_token_mask.reshape(1, -1), + "images_emb_mask": + images_emb_mask, + "attention_mask": + torch.ones(1, len(input_ids)), + } + return prepare + def vllm_to_hf_output(vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -109,44 +262,13 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, return hf_output_ids, hf_output_str, out_logprobs -def get_input(tokenizer, prompt, image, dtype, device): - - image_id = 100015 - prompt = prompt[0] - image = image[0] - vl_image = VLMImageProcessor(384) - prompt = prompt.replace('', '' * 576) - input_ids = tokenizer.encode(prompt) - input_ids = torch.LongTensor(input_ids) - image_token_mask = input_ids == image_id - images_outputs = vl_image(image, return_tensors="pt") - images_emb_mask = torch.ones(1, 1, 576) == 1 - prepare = { - "sft_format": - prompt, - "input_ids": - input_ids.to(device), - "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, - 384).to(device), - "num_image_tokens": - 576, - "images_seq_mask": - image_token_mask.reshape(1, -1).to(device), - "images_emb_mask": - images_emb_mask.to(device), - "attention_mask": - torch.ones(1, len(input_ids)).to(device), - } - return prepare - - def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], image_assets: _ImageAssets, model: str, *, + size_factors: List[float], dtype: str, max_tokens: int, num_logprobs: int, @@ -164,8 +286,10 @@ def run_test( """ images = [asset.pil_image for asset in image_assets] - inputs_per_image = [([prompt], [image]) - for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. 
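The processor's get_input above relies on a simple tag-expansion trick: the single image tag in the prompt is repeated num_image_tokens times before tokenization, and the boolean mask over the resulting ids marks the slots that the vision embeddings later overwrite. A toy version with a 4-token image (the real processor uses 576 tokens; the surrounding ids are made up, while 100015 is the image-token id used throughout this series):

    import torch

    image_id, num_image_tokens = 100015, 4
    input_ids = torch.LongTensor([100000, image_id, image_id, image_id, image_id, 3920])
    images_seq_mask = input_ids == image_id
    images_emb_mask = torch.ones(1, 1, num_image_tokens) == 1

    assert images_seq_mask.sum().item() == num_image_tokens
    assert images_emb_mask.all()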
@@ -185,56 +309,34 @@ def run_test( images=images) for prompts, images in inputs_per_image ] - print(f'vllm_outputs_per_image -> {vllm_outputs_per_image}') - AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, - MultiModalityCausalLM) - tokenizer = AutoTokenizer.from_pretrained(model) - hf_model = AutoModelForCausalLM.from_pretrained(model, - trust_remote_code=True) - device = 'cuda' - dtype_dict = { - 'bfloat16': torch.bfloat16, - 'float16': torch.float16, - 'half': torch.bfloat16, - 'float32': torch.float32, - 'auto': hf_model.dtype - } - dtype = dtype_dict.get(dtype, hf_model.dtype) - hf_model = hf_model.to(dtype).to(device) - hf_outputs: List = [] - for prompts, images in inputs_per_image: - print(f'prompt: {prompts}') - print(f'images: {images}') - prepare_input = get_input(tokenizer, prompts, images, dtype, device) - attention_mask = prepare_input['attention_mask'] - inputs_embeds = hf_model.prepare_inputs_embeds(**prepare_input) - outputs = hf_model.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - max_new_tokens=max_tokens, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - do_sample=False, - use_cache=True, + # AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, + # MultiModalityCausalLM) + AutoModelForVision2Seq.register(DeepSeekMultiModalityConfig, + MultiModalityCausalLM) + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_model.processor = VLChatProcessor.from_pretrained(model) + + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + print(f'hf_outputs: {hf_outputs}') + print(f'vllm_outputs: {vllm_outputs}') + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", ) - for o in outputs: - hf_outputs.append((o.cpu().tolist(), - tokenizer.decode(o.cpu().tolist(), - skip_special_tokens=True))) - vllm_outputs_list = [] - for vllm_outputs in vllm_outputs_per_image: - vllm_outputs_list.append( - tuple([ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ][:2])) - print(f'hf_outputs --> {hf_outputs}') - print(f'vllm_outputs --> {vllm_outputs_list}') - check_outputs_equal(outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs_list, - name_0='hf', - name_1='vllm') print('END---->') @@ -248,7 +350,7 @@ def run_test( [1.0], # Single-scale, batched [1.0, 1.0, 1.0], - # Multi-scale + # # Multi-scale [0.25, 0.5, 1.0], ], ) @@ -262,6 +364,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, vllm_runner, image_assets, model, + size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 60d5a8a20a36c..41156e5a54f09 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - DeepSeekMultiModalityConfig, - JAISConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig) + JAISConfig, MedusaConfig, + MLPSpeculatorConfig, MPTConfig, + 
RWConfig, + DeepSeekMultiModalityConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -25,6 +26,7 @@ "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, + "medusa": MedusaConfig, "multi_modality": DeepSeekMultiModalityConfig, } From 0748ce41d090418d45e12873aff9b39475095b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 12 Jul 2024 12:20:10 +0800 Subject: [PATCH 38/47] update test --- tests/models/test_deepseek_vl.py | 21 +++++---------------- vllm/adapter_commons/layers.py | 2 +- vllm/prompt_adapter/layers.py | 2 +- vllm/transformers_utils/config.py | 5 ++--- 4 files changed, 9 insertions(+), 21 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index d31aa0ad4e0ea..139e267b14896 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,22 +1,22 @@ -from typing import List, Optional, Tuple, Type from dataclasses import dataclass +from typing import List, Optional, Tuple, Type import pytest import torch -from transformers import AutoTokenizer, LlamaForCausalLM, AutoModelForVision2Seq -from transformers import LlamaTokenizerFast +from transformers import (AutoModelForVision2Seq, AutoTokenizer, + LlamaForCausalLM, LlamaTokenizerFast) from transformers.processing_utils import ProcessorMixin from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) -from vllm.sequence import SampleLogprobs from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs from vllm.transformers_utils.config import DeepSeekMultiModalityConfig from ..conftest import HfRunner, VllmRunner, _ImageAssets from .utils import check_logprobs_close -models = ["/deepseek-ai/deepseek-vl-1.3b-chat"] +models = ["deepseek-ai/deepseek-vl-1.3b-chat"] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm @@ -47,12 +47,10 @@ def __setitem__(self, key, value): @dataclass class VLChatProcessorOutput(DictOutput): - sft_format: List[str] input_ids: torch.Tensor pixel_values: torch.Tensor attention_mask: torch.Tensor images_seq_mask: torch.BoolTensor - images_emb_mask: torch.BoolTensor def __len__(self): return len(self.input_ids) @@ -61,7 +59,6 @@ def to(self, device): self.input_ids = self.input_ids.to(device) self.attention_mask = self.attention_mask.to(device) self.images_seq_mask = self.images_seq_mask.to(device) - self.images_emb_mask = self.images_emb_mask.to(device) self.pixel_values = self.pixel_values.to(device=device) return self @@ -126,10 +123,8 @@ def prepare_inputs_embeds( def generate(self, *args, **kwargs): - sft_format = kwargs.pop('sft_format') pixel_values = kwargs.pop('pixel_values') images_seq_mask = kwargs.pop('images_seq_mask') - images_emb_mask = kwargs.pop('images_emb_mask') input_ids = kwargs.pop('input_ids') inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, images_seq_mask) @@ -143,7 +138,6 @@ def generate(self, *args, **kwargs): bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, **kwargs) - # output.sequences[0] = torch.concat([input_ids[0], output.sequences[0]]) return output def get_output_embeddings(self): @@ -220,11 +214,8 @@ def get_input(self, prompt, image): input_ids = torch.LongTensor(input_ids) image_token_mask = input_ids == self.image_id images_outputs = self.image_processor(image, return_tensors="pt") - images_emb_mask = torch.ones(1, 1, self.num_image_tokens) == 
1 image_size = self.image_processor.image_size prepare = { - "sft_format": - prompt, "input_ids": input_ids.reshape(1, -1), "pixel_values": @@ -232,8 +223,6 @@ def get_input(self, prompt, image): image_size), "images_seq_mask": image_token_mask.reshape(1, -1), - "images_emb_mask": - images_emb_mask, "attention_mask": torch.ones(1, len(input_ids)), } diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 3ed60678b52f5..39ef9643fc3ec 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -11,4 +11,4 @@ class AdapterMapping: def __post_init__(self): self.index_mapping = tuple(self.index_mapping) - self.prompt_mapping = tuple(self.prompt_mapping) \ No newline at end of file + self.prompt_mapping = tuple(self.prompt_mapping) diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py index 27a61e692e1b7..8f5374bb6c92b 100644 --- a/vllm/prompt_adapter/layers.py +++ b/vllm/prompt_adapter/layers.py @@ -77,4 +77,4 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Update hidden states hidden_states[valid_mask] = gathered_embeddings - return hidden_states \ No newline at end of file + return hidden_states diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3bd334a14dd4a..f1f80599b2ea7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,10 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, + DeepSeekMultiModalityConfig, JAISConfig, MedusaConfig, MLPSpeculatorConfig, MPTConfig, - RWConfig, - DeepSeekMultiModalityConfig) + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -106,4 +106,3 @@ def try_get_generation_config( return GenerationConfig.from_model_config(config) except OSError: # Not found return None - From a063b71e787e4849d2d059c0699a7480b21863d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 15 Jul 2024 15:14:07 +0800 Subject: [PATCH 39/47] update test --- tests/entrypoints/openai/conftest.py | 2 +- tests/models/test_deepseek_vl.py | 29 ++++++++++------------------ 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py index 0837644f26bde..3c48b4273634d 100644 --- a/tests/entrypoints/openai/conftest.py +++ b/tests/entrypoints/openai/conftest.py @@ -66,4 +66,4 @@ def sample_sql_statements(): table: "table_1" | "table_2" condition: column "=" number number: "1" | "2" -""") \ No newline at end of file +""") diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 139e267b14896..8797769500e91 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -110,13 +110,8 @@ def prepare_inputs_embeds( images_embeds = images_embeds.reshape(bs, n * t, d) # [b, T, D] - input_ids[input_ids < 0] = 0 # ignore the image embeddings - inputs_embeds = self.language_model.get_input_embeddings()( - input_ids).reshape(1, -1, 2048) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - # replace with the image embeddings - images_embeds = images_embeds.reshape( - 1, -1, self.config.aligner_config.params["n_embed"]) inputs_embeds[images_seq_mask] = images_embeds return inputs_embeds @@ -128,15 +123,13 @@ def generate(self, *args, **kwargs): input_ids = kwargs.pop('input_ids') inputs_embeds = self.prepare_inputs_embeds(input_ids, 
pixel_values, images_seq_mask) - tokenizer = AutoTokenizer.from_pretrained( - "/pretrained_models/deepseek-vl-1.3b-chat") output = self.language_model.generate( *args, input_ids=input_ids, inputs_embeds=inputs_embeds, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, + pad_token_id=self.tokenizer.eos_token_id, + bos_token_id=self.tokenizer.bos_token_id, + eos_token_id=self.tokenizer.eos_token_id, **kwargs) return output @@ -206,8 +199,6 @@ def __call__(self, *arg, **kwargs): return VLChatProcessorOutput(**self.get_input(prompt, image)) def get_input(self, prompt, image): - prompt = prompt - image = image prompt = prompt.replace(self.image_tag, self.image_tag * self.num_image_tokens) input_ids = self.tokenizer.encode(prompt) @@ -279,7 +270,7 @@ def run_test( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - + print(inputs_per_image) # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it @@ -298,13 +289,13 @@ def run_test( images=images) for prompts, images in inputs_per_image ] - # AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, - # MultiModalityCausalLM) + AutoModelForVision2Seq.register(DeepSeekMultiModalityConfig, MultiModalityCausalLM) with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: hf_model.processor = VLChatProcessor.from_pretrained(model) + hf_model.model.tokenizer = AutoTokenizer.from_pretrained(model) hf_outputs_per_image = [ hf_model.generate_greedy_logprobs_limit(prompts, @@ -318,8 +309,8 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - print(f'hf_outputs: {hf_outputs}') - print(f'vllm_outputs: {vllm_outputs}') + # print(f'hf_outputs: {hf_outputs}') + # print(f'vllm_outputs: {vllm_outputs}') check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, @@ -339,7 +330,7 @@ def run_test( [1.0], # Single-scale, batched [1.0, 1.0, 1.0], - # # Multi-scale + # Multi-scale [0.25, 0.5, 1.0], ], ) From 8be4a362acaba8fef6aad965c66357f128a059aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 15 Jul 2024 18:03:36 +0800 Subject: [PATCH 40/47] update test --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 8797769500e91..c8d3ae580937b 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -134,7 +134,7 @@ def generate(self, *args, **kwargs): return output def get_output_embeddings(self): - return self.language_model.get_output_embeddings() + return None def get_input_embeddings(self): return self.language_model.get_input_embeddings() From c105475934c28bd40e84a0c1cabea182dfb11e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 16 Jul 2024 14:10:16 +0800 Subject: [PATCH 41/47] fix test failed --- tests/models/test_deepseek_vl.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index c8d3ae580937b..e85cf3b48a904 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -16,7 +16,9 @@ from ..conftest 
import HfRunner, VllmRunner, _ImageAssets from .utils import check_logprobs_close -models = ["deepseek-ai/deepseek-vl-1.3b-chat"] +models = [ + "deepseek-ai/deepseek-vl-1.3b-chat", "deepseek-ai/deepseek-vl-7b-chat" +] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm @@ -78,6 +80,8 @@ def __init__(self, config: DeepSeekMultiModalityConfig): language_config = config.language_config self.language_model = LlamaForCausalLM(language_config) + # this model does not support tie_word_embeddings + setattr(self.config, 'tie_word_embeddings', False) def prepare_inputs_embeds( self, @@ -134,7 +138,7 @@ def generate(self, *args, **kwargs): return output def get_output_embeddings(self): - return None + return self.language_model.get_output_embeddings() def get_input_embeddings(self): return self.language_model.get_input_embeddings() @@ -270,7 +274,6 @@ def run_test( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - print(inputs_per_image) # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it @@ -309,8 +312,6 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - # print(f'hf_outputs: {hf_outputs}') - # print(f'vllm_outputs: {vllm_outputs}') check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, From 5378c1041c159317071ed71582031dcd08238d70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 16 Jul 2024 14:43:58 +0800 Subject: [PATCH 42/47] fix test failed --- tests/models/test_deepseek_vl.py | 2 +- vllm/model_executor/models/deepseek_vl.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index e85cf3b48a904..275748ed34051 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -81,7 +81,7 @@ def __init__(self, config: DeepSeekMultiModalityConfig): language_config = config.language_config self.language_model = LlamaForCausalLM(language_config) # this model does not support tie_word_embeddings - setattr(self.config, 'tie_word_embeddings', False) + self.config.update({'tie_word_embeddings': False}) def prepare_inputs_embeds( self, diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index f79d8b03c1c75..6380410a4d23d 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -57,7 +57,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) -from vllm.sequence import SamplerOutput +from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig from .clip import dummy_seq_data_for_clip @@ -2287,6 +2287,7 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, **kwargs: object, ): pixel_values = kwargs.pop("pixel_values", None) From e6d1aeb470cbc10c056b1aeba37aa7cd832f284a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 18 Jul 2024 10:26:06 +0800 Subject: [PATCH 43/47] update example --- examples/deepseek_vl_example.py 
| 64 +++++++++++++-------------------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 84be2ebd5b397..72f485fc8ddaf 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -1,41 +1,43 @@ -import os -import subprocess - -import torch -from PIL import Image - -# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. -# You can use `.buildkite/download-images.sh` to download them +from vllm.assets.image import ImageAsset from vllm import LLM, SamplingParams sample_params = SamplingParams(temperature=0, max_tokens=1024) model = "deepseek-ai/deepseek-vl-7b-chat" model = "deepseek-ai/deepseek-vl-1.3b-chat" -prompt = "You are a helpful language and vision assistant." \ - "You are able to understand the visual content that the user provides," \ - "and assist the user with a variety of tasks using natural language.\n" \ - "User: Describe the content of this image.\nAssistant:" +prompt_one = "You are a helpful language and vision assistant." \ + "You are able to understand the visual content that the user provides," \ + "and assist the user with a variety of tasks using natural language.\n" \ + "User: Describe the content of this image.\nAssistant:"\ -def run_deepseek_vl(): - llm = LLM(model=model, - max_model_len=3072, - enforce_eager=True, - dtype=torch.bfloat16) +prompt_two = "You are a helpful language and vision assistant. You are able to " \ + "understand the visual content that the user provides, and assist the " \ + "user with a variety of tasks using natural language.\n User: " \ + "What is the season?\nAssistant:" - image = Image.open("images/stop_sign.jpg") +def run_deepseek_vl(): + llm = LLM(model=model) + stop_sign_image = ImageAsset("stop_sign").pil_image + cherry_blossom_image = ImageAsset("cherry_blossom").pil_image outputs = llm.generate( - { - "prompt": prompt, + [{ + "prompt": prompt_one, "multi_modal_data": { - "image": image + "image": stop_sign_image }, - }, - sampling_params=sample_params) + }, { + "prompt": prompt_two, + "multi_modal_data": { + "image": cherry_blossom_image + } + }], + sampling_params=sample_params, + ) for o in outputs: generated_text = o.outputs[0].text + print("------------------") print(generated_text) @@ -44,20 +46,4 @@ def main(): if __name__ == "__main__": - # Download from s3 - s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" - local_directory = "images" - - # Make sure the local directory exists or create it - os.makedirs(local_directory, exist_ok=True) - - # Use AWS CLI to sync the directory, assume anonymous access - subprocess.check_call([ - "aws", - "s3", - "sync", - s3_bucket_path, - local_directory, - "--no-sign-request", - ]) main() From bbef74815d9048a0e4e1f0b223b7c40ae672a006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 18 Jul 2024 10:32:49 +0800 Subject: [PATCH 44/47] update example --- examples/deepseek_vl_example.py | 43 ++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 72f485fc8ddaf..4e24539d1111d 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -4,16 +4,18 @@ sample_params = SamplingParams(temperature=0, max_tokens=1024) model = "deepseek-ai/deepseek-vl-7b-chat" model = "deepseek-ai/deepseek-vl-1.3b-chat" -prompt_one = "You are a helpful language and vision assistant." 
\ - "You are able to understand the visual content that the user provides," \ - "and assist the user with a variety of tasks using natural language.\n" \ - "User: Describe the content of this image.\nAssistant:"\ +prompt_one = ( + "You are a helpful language and vision assistant." + "You are able to understand the visual content that the user provides," + "and assist the user with a variety of tasks using natural language.\n" + "User: Describe the content of this image.\nAssistant:" +) - -prompt_two = "You are a helpful language and vision assistant. You are able to " \ - "understand the visual content that the user provides, and assist the " \ - "user with a variety of tasks using natural language.\n User: " \ - "What is the season?\nAssistant:" +prompt_two = ( + "You are a helpful language and vision assistant. You are able to " + "understand the visual content that the user provides, and assist the " + "user with a variety of tasks using natural language.\n User: " + "What is the season?\nAssistant:") def run_deepseek_vl(): @@ -21,17 +23,20 @@ def run_deepseek_vl(): stop_sign_image = ImageAsset("stop_sign").pil_image cherry_blossom_image = ImageAsset("cherry_blossom").pil_image outputs = llm.generate( - [{ - "prompt": prompt_one, - "multi_modal_data": { - "image": stop_sign_image + [ + { + "prompt": prompt_one, + "multi_modal_data": { + "image": stop_sign_image + }, + }, + { + "prompt": prompt_two, + "multi_modal_data": { + "image": cherry_blossom_image + }, }, - }, { - "prompt": prompt_two, - "multi_modal_data": { - "image": cherry_blossom_image - } - }], + ], sampling_params=sample_params, ) From 3552c0321597e937be20dec89603ca90996574a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 22 Jul 2024 16:27:51 +0800 Subject: [PATCH 45/47] fix ruff error --- vllm/transformers_utils/config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f1f80599b2ea7..4c56704f669ce 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,11 +5,11 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, +from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig, DeepSeekMultiModalityConfig, - JAISConfig, MedusaConfig, - MLPSpeculatorConfig, MPTConfig, - RWConfig) + DbrxConfig, JAISConfig, + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -19,6 +19,7 @@ logger = init_logger(__name__) _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + "chameleon": ChameleonConfig, "chatglm": ChatGLMConfig, "dbrx": DbrxConfig, "mpt": MPTConfig, From d615870a58deee0feb98254ce741474f78e070ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 25 Jul 2024 14:12:45 +0800 Subject: [PATCH 46/47] fix conflict --- docs/source/models/supported_models.rst | 14 +++++++++++--- vllm/transformers_utils/config.py | 9 ++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 1bf6631b5a797..3d765231d1316 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -94,8 +94,8 @@ Decoder-only Language Models - :code:`ai21labs/Jamba-v0.1`, etc. 
     - ✅︎
   * - :code:`LlamaForCausalLM`
-    - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
-    - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
+    - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
+    - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc.
     - ✅︎
   * - :code:`MiniCPMForCausalLM`
     - MiniCPM
@@ -182,6 +182,10 @@ Vision Language Models
     - Models
     - Example HuggingFace Models
     - :ref:`LoRA `
+  * - :code:`ChameleonForConditionalGeneration`
+    - Chameleon
+    - :code:`facebook/chameleon-7b` etc.
+    -
   * - :code:`FuyuForCausalLM`
     - Fuyu
     - :code:`adept/fuyu-8b` etc.
@@ -202,6 +206,10 @@ Vision Language Models
     - Phi-3-Vision
     - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
     -
+  * - :code:`MiniCPM-V`
+    - MiniCPM-V
+    - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
+    -
   * - :code:`DeepSeekMultiModalityCausalLM`
     - deepseek-ai
     - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc.
@@ -267,4 +275,4 @@ We have the following levels of testing for models:
 1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py `_ and `test_big_models.py `_ for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
 3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests `_ and `examples `_ for the models that have passed this test.
-4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
+4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
\ No newline at end of file
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4c56704f669ce..f1f80599b2ea7 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -5,11 +5,11 @@
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
-from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig,
+from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                              DeepSeekMultiModalityConfig,
-                                             DbrxConfig, JAISConfig,
-                                             MedusaConfig, MLPSpeculatorConfig,
-                                             MPTConfig, RWConfig)
+                                             JAISConfig, MedusaConfig,
+                                             MLPSpeculatorConfig, MPTConfig,
+                                             RWConfig)

 if VLLM_USE_MODELSCOPE:
     from modelscope import AutoConfig
@@ -19,7 +19,6 @@
 logger = init_logger(__name__)

 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
-    "chameleon": ChameleonConfig,
     "chatglm": ChatGLMConfig,
     "dbrx": DbrxConfig,
     "mpt": MPTConfig,

From f48ba9b15086f3d82d371699d42482d36b019079 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?=
Date: Thu, 25 Jul 2024 16:42:27 +0800
Subject: [PATCH 47/47] delete useless code

---
 examples/deepseek_vl_example.py   | 2 +-
 vllm/transformers_utils/config.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py
index 4e24539d1111d..38e2feb02564d 100644
--- a/examples/deepseek_vl_example.py
+++ b/examples/deepseek_vl_example.py
@@ -1,5 +1,5 @@
-from vllm.assets.image import ImageAsset
 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset

 sample_params = SamplingParams(temperature=0, max_tokens=1024)
 model = "deepseek-ai/deepseek-vl-7b-chat"
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index f1f80599b2ea7..652505a892142 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -6,7 +6,6 @@
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                             DeepSeekMultiModalityConfig,
                                              JAISConfig, MedusaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
                                              RWConfig)
@@ -27,7 +26,6 @@
     "jais": JAISConfig,
     "mlp_speculator": MLPSpeculatorConfig,
     "medusa": MedusaConfig,
-    "multi_modality": DeepSeekMultiModalityConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
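
[Editor's note, not part of the patch series] The last two patches register and then unregister DeepSeekMultiModalityConfig in _CONFIG_REGISTRY. For readers unfamiliar with that table, the sketch below illustrates only the general pattern it serves: mapping a checkpoint's model_type string to a custom PretrainedConfig subclass when the stock AutoConfig result is not enough. The resolve_config helper and its exact behavior are illustrative assumptions, not vLLM's actual loader code.

    from typing import Dict, Type

    from transformers import AutoConfig, PretrainedConfig

    from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig

    # Illustrative registry, shaped like _CONFIG_REGISTRY in the diffs above.
    _EXAMPLE_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
        "multi_modality": DeepSeekMultiModalityConfig,
    }

    def resolve_config(model: str) -> PretrainedConfig:
        # Load whatever config transformers can produce, then swap in the
        # registered subclass when model_type matches a registry entry.
        config = AutoConfig.from_pretrained(model, trust_remote_code=True)
        config_class = _EXAMPLE_REGISTRY.get(config.model_type)
        if config_class is not None:
            config = config_class.from_pretrained(model)
        return config

    # e.g. resolve_config("deepseek-ai/deepseek-vl-1.3b-chat")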
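[Editor's note, not part of the patch series] With the final revision of examples/deepseek_vl_example.py above, test images come from ImageAsset rather than an S3 sync, so the example can be run directly. A single-request variant is sketched below; it reuses only the names visible in the diffs (LLM, SamplingParams, ImageAsset, the deepseek-vl-1.3b-chat checkpoint) with a shortened prompt, and is a sketch rather than part of the PR.

    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    # Minimal single-image run modeled on examples/deepseek_vl_example.py.
    llm = LLM(model="deepseek-ai/deepseek-vl-1.3b-chat")
    outputs = llm.generate(
        {
            "prompt": "You are a helpful language and vision assistant.\n"
                      "User: Describe the content of this image.\nAssistant:",
            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
        },
        sampling_params=SamplingParams(temperature=0, max_tokens=1024),
    )
    print(outputs[0].outputs[0].text)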