From 4bd5aa9d39caec988b3e3efd0fdb4c4a74aca396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 25 Jun 2024 15:11:47 +0800 Subject: [PATCH 01/47] [Model] Initialize deepseek-vl-7b-chat support --- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/deepseek_vl.py | 1901 +++++++++++++++++ vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 2 + .../transformers_utils/configs/deepseek_vl.py | 89 + 5 files changed, 1996 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/deepseek_vl.py create mode 100644 vllm/transformers_utils/configs/deepseek_vl.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 5afb2e1d44d39..9d1da1ebc2643 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -21,6 +21,7 @@ "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), + "MultiModalityCausalLM": ("deepseek_vl", "DeepSeekMultiModalityCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py new file mode 100644 index 0000000000000..88178d9d773a2 --- /dev/null +++ b/vllm/model_executor/models/deepseek_vl.py @@ -0,0 +1,1901 @@ +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
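+#
+# Inference-only DeepSeek-VL model definition for vLLM. The file bundles:
+#   * VLMImageProcessor - resizes/pads images to a square canvas, then
+#     rescales and normalizes them into pixel_values,
+#   * SigLIP- and SAM-style vision towers plus a HybridVisionTower that
+#     combines a high-resolution and a low-resolution encoder,
+#   * MlpProjector, used as the "aligner" that maps vision features into
+#     the language model's embedding space,
+#   * DeepSeekMultiModalityCausalLM, which splices the aligned image
+#     embeddings into a LlamaModel's input embeddings before decoding.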
+ +import math +import warnings +import copy + +from dataclasses import dataclass +from functools import partial +from dataclasses import dataclass +from functools import partial +from typing import ( + Callable, + Dict, + Final, + List, + Literal, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, +) + +import torch +import torch.nn as nn +import torchvision.transforms +import torch.nn.functional as F +import numpy as np +import torchvision +import torchvision.transforms.functional +import torch.nn.functional as F + + +from einops import rearrange +from transformers import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig +from einops import rearrange +from PIL import Image +from transformers import AutoImageProcessor, PretrainedConfig +from transformers.image_processing_utils import BaseImageProcessor, BatchFeature +from transformers.image_utils import to_numpy_array +from einops import rearrange +from transformers import PreTrainedModel +from timm.layers import ( + AttentionPoolLatent, + DropPath, + LayerType, + Mlp, + PatchDropout, + PatchEmbed, + resample_abs_pos_embed, +) +from timm.models._manipulate import checkpoint_seq, named_apply + + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.llama import LlamaModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import get_dummy_image_data +from vllm.sequence import SamplerOutput +from .vlm_base import VisionLanguageModelBase +from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig + + +ImageType = Union[np.ndarray, torch.Tensor, Image.Image] +IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) +IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) +IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) +IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) + + +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + +class VLMImageProcessorConfig(PretrainedConfig): + model_type = "deepseek_vlm" + image_size: int + min_size: int + image_mean: Union[Tuple[float, float, float], List[float]] + image_std: Union[Tuple[float, float, float], List[float]] + rescale_factor: float + do_normalize: bool + + def __init__( + self, + image_size: int, + min_size: int = 14, + image_mean: Union[Tuple[float, float, float], List[float]] = ( + 0.48145466, + 0.4578275, + 0.40821073, + ), + image_std: Union[Tuple[float, float, float], List[float]] = ( + 0.26862954, + 0.26130258, + 0.27577711, + ), + rescale_factor: float = 1.0 / 255.0, + do_normalize: bool = True, + **kwargs, + ): + self.image_size = image_size + self.min_size = min_size + self.image_mean = image_mean + self.image_std = image_std + self.rescale_factor = 
rescale_factor + self.do_normalize = do_normalize + + super().__init__(**kwargs) + + +class VLMImageProcessor(BaseImageProcessor): + model_input_names = ["pixel_values"] + + def __init__( + self, + image_size: int, + min_size: int = 14, + image_mean: Union[Tuple[float, float, float], List[float]] = ( + 0.48145466, + 0.4578275, + 0.40821073, + ), + image_std: Union[Tuple[float, float, float], List[float]] = ( + 0.26862954, + 0.26130258, + 0.27577711, + ), + rescale_factor: float = 1.0 / 255.0, + do_normalize: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.rescale_factor = rescale_factor + self.image_mean = image_mean + self.image_std = image_std + self.min_size = min_size + self.do_normalize = do_normalize + + if image_mean is None: + self.background_color = (127, 127, 127) + else: + self.background_color = tuple([int(x * 255) for x in image_mean]) + + def resize(self, pil_img: Image) -> np.ndarray: + """ + + Args: + pil_img (PIL.Image): [H, W, 3] in PIL.Image in RGB + + Returns: + x (np.ndarray): [3, self.image_size, self.image_size] + """ + + width, height = pil_img.size + max_size = max(width, height) + + size = [ + max(int(height / max_size * self.image_size), self.min_size), + max(int(width / max_size * self.image_size), self.min_size), + ] + + if width <= 0 or height <= 0 or size[0] <= 0 or size[1] <= 0: + print(f"orig size = {pil_img.size}, new size = {size}") + raise ValueError("Invalid size!") + + pil_img = torchvision.transforms.functional.resize( + pil_img, + size, + interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC, + antialias=True, + ) + + pil_img = expand2square(pil_img, self.background_color) + x = to_numpy_array(pil_img) + + # [H, W, 3] -> [3, H, W] + x = np.transpose(x, (2, 0, 1)) + + return x + + def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature: + # resize and pad to [self.image_size, self.image_size] + # then convert from [H, W, 3] to [3, H, W] + # print(images) + if not isinstance(images, List): + images = [ + images, + ] + images: List[np.ndarray] = [self.resize(image) for image in images] + + # resacle from [0, 255] -> [0, 1] + images = [ + self.rescale( + image=image, + scale=self.rescale_factor, + input_data_format="channels_first", + ) + for image in images + ] + + # normalize + if self.do_normalize: + images = [ + self.normalize( + image=image, + mean=self.image_mean, + std=self.image_std, + input_data_format="channels_first", + ) + for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + @property + def default_shape(self): + return [3, self.image_size, self.image_size] + + +class MlpProjector(nn.Module): + def __init__(self, cfg): + super().__init__() + + self.cfg = cfg + + if cfg.projector_type == "identity": + modules = nn.Identity() + + elif cfg.projector_type == "linear": + modules = nn.Linear(cfg.input_dim, cfg.n_embed) + + elif cfg.projector_type == "mlp_gelu": + mlp_depth = cfg.get("depth", 1) + modules = [nn.Linear(cfg.input_dim, cfg.n_embed)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(cfg.n_embed, cfg.n_embed)) + modules = nn.Sequential(*modules) + + elif cfg.projector_type == "low_high_hybrid_split_mlp_gelu": + mlp_depth = cfg.get("depth", 1) + self.high_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2) + self.low_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2) + + modules = [] + for _ in range(1, mlp_depth): + 
modules.append(nn.GELU()) + modules.append(nn.Linear(cfg.n_embed, cfg.n_embed)) + modules = nn.Sequential(*modules) + + else: + raise ValueError(f"Unknown projector type: {cfg.projector_type}") + + self.layers = modules + + def forward( + self, x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor] + ): + """ + + Args: + x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, + then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); + otherwise it is the feature from the single vision encoder. + + Returns: + x (torch.Tensor): [b, s, c] + """ + + if isinstance(x_or_tuple, tuple): + # self.cfg.projector_type == "low_high_hybrid_split_mlp_gelu": + high_x, low_x = x_or_tuple + high_x = self.high_up_proj(high_x) + low_x = self.low_up_proj(low_x) + x = torch.concat([high_x, low_x], dim=-1) + else: + x = x_or_tuple + + return self.layers(x) + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) # noqa: E741 + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # type: (torch.Tensor, float, float, float, float) -> torch.Tensor + r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first + convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its orignal dtype. + Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn + from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + + with torch.no_grad(): + dtype = tensor.dtype + tensor_fp32 = tensor.float() + tensor_fp32 = _no_grad_trunc_normal_(tensor_fp32, mean, std, a, b) + tensor_dtype = tensor_fp32.to(dtype=dtype) + tensor.copy_(tensor_dtype) + + +def init_weights(self): + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5) + trunc_normal_(self.latent, std=self.latent_dim**-0.5) + + +def init_weights_vit_timm(module: nn.Module, name: str = "") -> None: + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif hasattr(module, "init_weights"): + module.init_weights() + + +class SigLipAttention(nn.Module): + fused_attn: Final[bool] + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + qk_norm: bool = False, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + norm_layer: nn.Module = nn.LayerNorm, + ) -> None: + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim**-0.5 + # self.fused_attn = use_fused_attn() + self.fused_attn = True + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0.0 else nn.Identity() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, self.head_dim) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv.unbind(0) + q, k = self.q_norm(q), self.k_norm(k) + + if self.fused_attn: + x = F.scaled_dot_product_attention( + q, + k, + v, + dropout_p=self.attn_drop.p if self.training else 0.0, + ) + else: + q = q * self.scale + attn = q @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + x = attn @ v + + x = x.transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: float = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class SigLipBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + qk_norm: bool = False, + proj_drop: float = 0.0, + attn_drop: float = 0.0, + init_values: Optional[float] = None, + drop_path: float = 0.0, + act_layer: nn.Module = nn.GELU, + norm_layer: nn.Module = nn.LayerNorm, + mlp_layer: nn.Module = Mlp, + ) -> None: + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = SigLipAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + attn_drop=attn_drop, + proj_drop=proj_drop, + norm_layer=norm_layer, + ) + 
self.ls1 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = mlp_layer( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer, + drop=proj_drop, + ) + self.ls2 = ( + LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + ) + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class VisionTransformer(nn.Module): + """Vision Transformer + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + """ + + dynamic_img_size: Final[bool] + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + num_classes: int = 1000, + global_pool: Literal["", "avg", "token", "map"] = "token", + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + qk_norm: bool = False, + init_values: Optional[float] = None, + class_token: bool = True, + no_embed_class: bool = False, + reg_tokens: int = 0, + pre_norm: bool = False, + fc_norm: Optional[bool] = None, + dynamic_img_size: bool = False, + dynamic_img_pad: bool = False, + drop_rate: float = 0.0, + pos_drop_rate: float = 0.0, + patch_drop_rate: float = 0.0, + proj_drop_rate: float = 0.0, + attn_drop_rate: float = 0.0, + drop_path_rate: float = 0.0, + weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "", + embed_layer: Callable = PatchEmbed, + norm_layer: Optional[LayerType] = None, + act_layer: Optional[LayerType] = None, + block_fn: Type[nn.Module] = SigLipBlock, + mlp_layer: Type[nn.Module] = Mlp, + ignore_head: bool = False, + ) -> None: + """ + Args: + img_size: Input image size. + patch_size: Patch size. + in_chans: Number of image input channels. + num_classes: Mumber of classes for classification head. + global_pool: Type of global pooling for final sequence (default: 'token'). + embed_dim: Transformer embedding dimension. + depth: Depth of transformer. + num_heads: Number of attention heads. + mlp_ratio: Ratio of mlp hidden dim to embedding dim. + qkv_bias: Enable bias for qkv projections if True. + init_values: Layer-scale init values (layer-scale enabled if not None). + class_token: Use class token. + no_embed_class: Don't include position embeddings for class (or reg) tokens. + reg_tokens: Number of register tokens. + fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'. + drop_rate: Head dropout rate. + pos_drop_rate: Position embedding dropout rate. + attn_drop_rate: Attention dropout rate. + drop_path_rate: Stochastic depth rate. + weight_init: Weight initialization scheme. + embed_layer: Patch embedding layer. + norm_layer: Normalization layer. + act_layer: MLP activation layer. + block_fn: Transformer block layer. 
+ """ + super().__init__() + assert global_pool in ("", "avg", "token", "map") + assert class_token or global_pool != "token" + use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm + # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6) + # act_layer = get_act_layer(act_layer) or nn.GELU + norm_layer = partial(nn.LayerNorm, eps=1e-6) + act_layer = nn.GELU + + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = ( + embed_dim # num_features for consistency with other models + ) + self.num_prefix_tokens = 1 if class_token else 0 + self.num_prefix_tokens += reg_tokens + self.num_reg_tokens = reg_tokens + self.has_class_token = class_token + self.no_embed_class = ( + no_embed_class # don't embed prefix positions (includes reg) + ) + self.dynamic_img_size = dynamic_img_size + self.grad_checkpointing = False + self.ignore_head = ignore_head + + embed_args = {} + if dynamic_img_size: + # flatten deferred until after pos embed + embed_args.update(dict(strict_img_size=False, output_fmt="NHWC")) + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + bias=not pre_norm, # disable bias if pre-norm is used (e.g. CLIP) + dynamic_img_pad=dynamic_img_pad, + **embed_args, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = ( + nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None + ) + self.reg_token = ( + nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None + ) + embed_len = ( + num_patches if no_embed_class else num_patches + self.num_prefix_tokens + ) + self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02) + self.pos_drop = nn.Dropout(p=pos_drop_rate) + if patch_drop_rate > 0: + self.patch_drop = PatchDropout( + patch_drop_rate, + num_prefix_tokens=self.num_prefix_tokens, + ) + else: + self.patch_drop = nn.Identity() + self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity() + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential( + *[ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + init_values=init_values, + proj_drop=proj_drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + mlp_layer=mlp_layer, + ) + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() + + # Classifier Head + if global_pool == "map": + AttentionPoolLatent.init_weights = init_weights + self.attn_pool = AttentionPoolLatent( + self.embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + ) + else: + self.attn_pool = None + self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() + self.head_drop = nn.Dropout(drop_rate) + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + if weight_init != "skip": + self.init_weights(weight_init) + + def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None: + assert mode in ("jax", "jax_nlhb", "moco", "") + # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0 + trunc_normal_(self.pos_embed, std=0.02) + if self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + @torch.jit.ignore + def no_weight_decay(self) -> Set: + return 
{"pos_embed", "cls_token", "dist_token"} + + @torch.jit.ignore + def group_matcher(self, coarse: bool = False) -> Dict: + return dict( + stem=r"^cls_token|pos_embed|patch_embed", # stem and embed + blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))], + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable: bool = True) -> None: + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self) -> nn.Module: + return self.head + + def reset_classifier(self, num_classes: int, global_pool=None) -> None: + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ("", "avg", "token", "map") + if global_pool == "map" and self.attn_pool is None: + assert ( + False + ), "Cannot currently add attention pooling in reset_classifier()." + elif global_pool != "map " and self.attn_pool is not None: + self.attn_pool = None # remove attention pooling + self.global_pool = global_pool + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: + if self.dynamic_img_size: + B, H, W, C = x.shape + pos_embed = resample_abs_pos_embed( + self.pos_embed, + (H, W), + num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens, + ) + x = x.view(B, -1, C) + else: + pos_embed = self.pos_embed + + to_cat = [] + if self.cls_token is not None: + to_cat.append(self.cls_token.expand(x.shape[0], -1, -1)) + if self.reg_token is not None: + to_cat.append(self.reg_token.expand(x.shape[0], -1, -1)) + + if self.no_embed_class: + # deit-3, updated JAX (big vision) + # position embedding does not overlap with class token, add then concat + x = x + pos_embed + if to_cat: + x = torch.cat(to_cat + [x], dim=1) + else: + # original timm, JAX, and deit vit impl + # pos_embed has entry for class token, concat then add + if to_cat: + x = torch.cat(to_cat + [x], dim=1) + x = x + pos_embed + + return self.pos_drop(x) + + def _intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, + ) -> List[torch.Tensor]: + outputs, num_blocks = [], len(self.blocks) + take_indices = set( + range(num_blocks - n, num_blocks) if isinstance(n, int) else n + ) + + # forward pass + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.patch_drop(x) + x = self.norm_pre(x) + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in take_indices: + outputs.append(x) + + return outputs + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, + reshape: bool = False, + return_prefix_tokens: bool = False, + norm: bool = False, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + """Intermediate layer accessor (NOTE: This is a WIP experiment). 
+ Inspired by DINO / DINOv2 interface + """ + # take last n blocks if n is an int, if in is a sequence, select by matching indices + outputs = self._intermediate_layers(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + prefix_tokens = [out[:, 0 : self.num_prefix_tokens] for out in outputs] + outputs = [out[:, self.num_prefix_tokens :] for out in outputs] + + if reshape: + grid_size = self.patch_embed.grid_size + outputs = [ + out.reshape(x.shape[0], grid_size[0], grid_size[1], -1) + .permute(0, 3, 1, 2) + .contiguous() + for out in outputs + ] + + if return_prefix_tokens: + return tuple(zip(outputs, prefix_tokens)) + return tuple(outputs) + + def forward_features(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.patch_drop(x) + x = self.norm_pre(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.norm(x) + return x + + def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor: + if self.attn_pool is not None: + x = self.attn_pool(x) + elif self.global_pool == "avg": + x = x[:, self.num_prefix_tokens :].mean(dim=1) + elif self.global_pool: + x = x[:, 0] # class token + x = self.fc_norm(x) + x = self.head_drop(x) + return x if pre_logits else self.head(x) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.forward_features(x) + if not self.ignore_head: + x = self.forward_head(x) + return x + + +@dataclass +class SigLIPVisionCfg: + width: int = 1152 + layers: Union[Tuple[int, int, int, int], int] = 27 + heads: int = 16 + patch_size: int = 14 + image_size: Union[Tuple[int, int], int] = 336 + global_pool: str = "map" + mlp_ratio: float = 3.7362 + class_token: bool = False + num_classes: int = 0 + use_checkpoint: bool = False + + +SigLIP_MODEL_CONFIG = { + "siglip_so400m_patch14_384": { + "image_size": 336, + "patch_size": 14, + "width": 1152, + "layers": 27, + "heads": 16, + "mlp_ratio": 3.7362, + "global_pool": "map", + "use_checkpoint": False, + }, + "siglip_so400m_patch14_224": { + "image_size": 224, + "patch_size": 14, + "width": 1152, + "layers": 27, + "heads": 16, + "mlp_ratio": 3.7362, + "global_pool": "map", + "use_checkpoint": False, + }, + "siglip_large_patch16_384": { + "image_size": 384, + "patch_size": 16, + "width": 1024, + "layers": 24, + "heads": 16, + "mlp_ratio": 4, + "global_pool": "map", + "use_checkpoint": False, + }, +} + + +def create_siglip_vit( + model_name: str = "siglip_so400m_patch14_384", + image_size: int = 384, + select_layer: int = -1, + ckpt_path: str = "", + **kwargs, +): + assert ( + model_name in SigLIP_MODEL_CONFIG.keys() + ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}" + + vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name]) + + if select_layer <= 0: + layers = min(vision_cfg.layers, vision_cfg.layers + select_layer + 1) + else: + layers = min(vision_cfg.layers, select_layer) + + model = VisionTransformer( + img_size=image_size, + patch_size=vision_cfg.patch_size, + embed_dim=vision_cfg.width, + depth=layers, + num_heads=vision_cfg.heads, + mlp_ratio=vision_cfg.mlp_ratio, + class_token=vision_cfg.class_token, + global_pool=vision_cfg.global_pool, + ignore_head=kwargs.get("ignore_head", True), + weight_init=kwargs.get("weight_init", "skip"), + num_classes=0, + ) + + if ckpt_path: + state_dict = torch.load(ckpt_path, map_location="cpu") + + incompatible_keys = model.load_state_dict(state_dict, strict=False) + print( + f"SigLIP-ViT restores 
from {ckpt_path},\n" + f"\tincompatible_keys:', {incompatible_keys}." + ) + + return model + + +class MLPBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + mlp_dim: int, + act: Type[nn.Module] = nn.GELU, + ) -> None: + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(x))) + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa +class ImageEncoderViT(nn.Module): + def __init__( + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + downsample_channels: Tuple[int, ...] = (512, 1024), + ) -> None: + """ + Args: + img_size (int): Input image size. + patch_size (int): Patch size. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of ViT. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + global_attn_indexes (list): Indexes for blocks using global attention. + downsample_channels (list): Channels for downsampling layers. + """ + super().__init__() + self.img_size = img_size + + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) + + self.pos_embed: Optional[nn.Parameter] = None + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. 
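+            # The embedding is kept as a (1, H // patch, W // patch, C) grid
+            # so it can be added directly to the NHWC patch tokens produced
+            # by PatchEmbed below.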
+ self.pos_embed = nn.Parameter( + torch.zeros( + 1, img_size // patch_size, img_size // patch_size, embed_dim + ) + ) + + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size if i not in global_attn_indexes else 0, + input_size=(img_size // patch_size, img_size // patch_size), + ) + self.blocks.append(block) + + self.neck = nn.Sequential( + nn.Conv2d( + embed_dim, + out_chans, + kernel_size=1, + bias=False, + ), + LayerNorm2d(out_chans), + nn.Conv2d( + out_chans, + out_chans, + kernel_size=3, + padding=1, + bias=False, + ), + LayerNorm2d(out_chans), + ) + + in_channels = out_chans + downsamples = [] + for i in range(len(downsample_channels)): + out_channels = downsample_channels[i] + downsamples.append( + nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False, + ) + ) + in_channels = out_channels + self.downsamples = nn.Sequential(*downsamples) + + self.sam_hd = True + if self.sam_hd: + self.hd_alpha_downsamples = nn.Parameter(torch.zeros(1)) + # self.neck_hd = nn.Linear(embed_dim, embed_dim) + self.neck_hd = copy.deepcopy(self.neck) + # self.downsamples_hd = copy.deepcopy(self.downsamples) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + self.pos_embed + + global_features = [] + for i, blk in enumerate(self.blocks): + x = blk(x) + if self.sam_hd and blk.window_size == 0: + global_features.append(x) + + x = self.neck(x.permute(0, 3, 1, 2)) + x_dtype = x.dtype + x = F.interpolate( + x.float(), size=(96, 96), mode="bilinear", align_corners=False + ).to(x_dtype) + x = self.downsamples(x) + + if self.sam_hd: + first_global_feature = self.neck_hd(global_features[0].permute(0, 3, 1, 2)) + x_dtype = first_global_feature.dtype + first_global_feature = F.interpolate( + first_global_feature.float(), + size=(96, 96), + mode="bilinear", + align_corners=False, + ) + first_global_feature = self.downsamples(first_global_feature.to(x_dtype)) + x = x + first_global_feature * self.hd_alpha_downsamples + + return x + + +class Block(nn.Module): + """Transformer blocks with support of window attention and residual propagation blocks""" + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. If it equals 0, then + use global attention. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. 
+ """ + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else (window_size, window_size), + ) + + self.norm2 = norm_layer(dim) + self.mlp = MLPBlock( + embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer + ) + + self.window_size = window_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x + x = self.norm1(x) + # Window partition + if self.window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, self.window_size) + + x = self.attn(x) + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, self.window_size, pad_hw, (H, W)) + + x = shortcut + x + x = x + self.mlp(self.norm2(x)) + + return x + + +class Attention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. + """ + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + assert ( + input_size is not None + ), "Input size must be provided if using relative positional encoding." + # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, H, W, _ = x.shape + # qkv with shape (3, B, nHead, H * W, C) + qkv = ( + self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + ) + # q, k, v with shape (B * nHead, H * W, C) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) + + def do_attention(q, k, v): + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.use_rel_pos: + attn = add_decomposed_rel_pos( + attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W) + ) + + attn = attn.softmax(dim=-1) + x = ( + (attn @ v) + .view(B, self.num_heads, H, W, -1) + .permute(0, 2, 3, 1, 4) + .reshape(B, H, W, -1) + ) + + return x + + # from haiscale.utils import on_demand_checkpoint + # x = on_demand_checkpoint(do_attention, q, k, v) + x = do_attention(q, k, v) + x = self.proj(x) + + return x + + +def window_partition( + x: torch.Tensor, window_size: int +) -> Tuple[torch.Tensor, Tuple[int, int]]: + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. 
+ (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) + return windows, (Hp, Wp) + + +def window_unpartition( + windows: torch.Tensor, + window_size: int, + pad_hw: Tuple[int, int], + hw: Tuple[int, int], +) -> torch.Tensor: + """ + Window unpartition into original sequences and removing padding. + Args: + windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view( + B, Hp // window_size, Wp // window_size, window_size, window_size, -1 + ) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos( + attn: torch.Tensor, + q: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +) -> torch.Tensor: + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. 
+ """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = ( + attn.view(B, q_h, q_w, k_h, k_w) + + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :] + ).view(B, q_h * q_w, k_h * k_w) + + return attn + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. + """ + + def __init__( + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x + + +@dataclass +class SAMViTCfg: + image_size: Union[Tuple[int, int], int] = 1024 + width: int = 1024 + layers: int = 23 + heads: int = 16 + patch_size: int = 16 + window_size: int = 14 + prompt_embed_dim: int = 256 + global_attn_indexes: Union[List[int], Tuple[int]] = (5, 11, 17, 23) + downsample_channels: Union[List[int], Tuple[int]] = (512, 1024) + + +SAM_MODEL_CONFIG = { + "sam_vit_b": { + "width": 768, + "layers": 12, + "heads": 12, + "global_attn_indexes": [2, 5, 8, 11], + "downsample_channels": (), + }, + "sam_b_downsample": { + "width": 768, + "layers": 12, + "heads": 12, + "global_attn_indexes": [2, 5, 8, 11], + "downsample_channels": (512, 1024), + }, + "sam_vit_l": { + "width": 1024, + "layers": 24, + "heads": 16, + "global_attn_indexes": [5, 11, 17, 23], + "downsample_channels": (), + }, + "sam_vit_h": { + "width": 1280, + "layers": 32, + "heads": 16, + "global_attn_indexes": [7, 15, 23, 31], + "downsample_channels": (), + }, +} + + +def create_sam_vit( + model_name: str = "sam_b_downsample", + image_size: int = 1024, + ckpt_path: str = "", + **kwargs, +): + assert ( + model_name in SAM_MODEL_CONFIG.keys() + ), f"model name: {model_name} should be in {SAM_MODEL_CONFIG.keys()}" + + sam_cfg = SAMViTCfg(**SAM_MODEL_CONFIG[model_name]) + image_encoder = ImageEncoderViT( + depth=sam_cfg.layers, + embed_dim=sam_cfg.width, + img_size=image_size, + mlp_ratio=4, + norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), + num_heads=sam_cfg.heads, + patch_size=sam_cfg.patch_size, + qkv_bias=True, + use_rel_pos=True, + global_attn_indexes=sam_cfg.global_attn_indexes, + window_size=14, + out_chans=sam_cfg.prompt_embed_dim, + downsample_channels=sam_cfg.downsample_channels, + ) + + if ckpt_path: + state_dict = torch.load(ckpt_path) + image_encoder.load_state_dict(state_dict, strict=False) + print(f"SAM-ViT restores from {ckpt_path}") + + return image_encoder + + +class CLIPVisionTower(nn.Module): + def __init__( + self, + model_name: str = "siglip_large_patch16_384", + image_size: Union[Tuple[int, int], int] = 336, + select_feature: str = "patch", + select_layer: int = -2, + select_layers: list = None, + ckpt_path: str = "", + pixel_mean: Optional[List[float]] = None, + pixel_std: Optional[List[float]] = None, + **kwargs, + ): + super().__init__() + + 
self.model_name = model_name + self.select_feature = select_feature + self.select_layer = select_layer + self.select_layers = select_layers + + vision_tower_params = { + "model_name": model_name, + "image_size": image_size, + "ckpt_path": ckpt_path, + "select_layer": select_layer, + } + vision_tower_params.update(kwargs) + self.vision_tower, self.forward_kwargs = self.build_vision_tower( + vision_tower_params + ) + + if pixel_mean is not None and pixel_std is not None: + image_norm = torchvision.transforms.Normalize( + mean=pixel_mean, std=pixel_std + ) + else: + image_norm = None + + self.image_norm = image_norm + + def build_vision_tower(self, vision_tower_params): + if self.model_name.startswith("siglip"): + self.select_feature = "same" + vision_tower = create_siglip_vit(**vision_tower_params) + forward_kwargs = dict() + + elif self.model_name.startswith("sam"): + vision_tower = create_sam_vit(**vision_tower_params) + forward_kwargs = dict() + + else: # huggingface + from transformers import CLIPVisionModel + + vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params) + forward_kwargs = dict(output_hidden_states=True) + + return vision_tower, forward_kwargs + + def feature_select(self, image_forward_outs): + if isinstance(image_forward_outs, torch.Tensor): + # the output has been the self.select_layer"s features + image_features = image_forward_outs + else: + image_features = image_forward_outs.hidden_states[self.select_layer] + + if self.select_feature == "patch": + # if the output has cls_token + image_features = image_features[:, 1:] + elif self.select_feature == "cls_patch": + image_features = image_features + elif self.select_feature == "same": + image_features = image_features + + else: + raise ValueError(f"Unexpected select feature: {self.select_feature}") + return image_features + + def forward(self, images): + """ + + Args: + images (torch.Tensor): [b, 3, H, W] + + Returns: + image_features (torch.Tensor): [b, n_patch, d] + """ + + if self.image_norm is not None: + images = self.image_norm(images) + + image_forward_outs = self.vision_tower(images, **self.forward_kwargs) + image_features = self.feature_select(image_forward_outs) + return image_features + + +class HybridVisionTower(nn.Module): + def __init__( + self, + high_res_cfg: Dict, + low_res_cfg: Dict, + freeze_high: bool = False, + freeze_low: bool = False, + concat_type: Literal["feature", "sequence", "add", "tuple"] = "tuple", + **ignore_kwargs, + ): + super().__init__() + + self.vision_tower_high = CLIPVisionTower(**high_res_cfg) + self.vision_tower_low = CLIPVisionTower(**low_res_cfg) + self.low_res_size = low_res_cfg["image_size"] + self.concat_type = concat_type + + self.high_layer_norm = nn.LayerNorm(high_res_cfg.get("output_dim", 1024)) + self.low_layer_norm = nn.LayerNorm(low_res_cfg.get("output_dim", 1024)) + + if freeze_high: + for p_name, p in self.vision_tower_high.named_parameters(): + p.requires_grad = False + self.vision_tower_high = self.vision_tower_high.eval() + else: + # train donwsamples and neck + for p_name, p in self.vision_tower_high.named_parameters(): + if "downsamples" in p_name or "neck" in p_name: + p.requires_grad = True + else: + p.requires_grad = False + + if freeze_low: + for p in self.vision_tower_low.parameters(): + p.requires_grad = False + self.vision_tower_low = self.vision_tower_low.eval() + + self.resize = torchvision.transforms.Resize(self.low_res_size, antialias=True) + + def forward(self, images: torch.Tensor): + """ + + Args: + images (torch.Tensor): [bs, 3, H, W] + 
+ Returns: + res (torch.Tensor): [bs, t, c] + """ + + # [bs, c, h, w] + high_images = images + + # [bs, c, h_low, w_low] + low_images = self.resize(images) + + # separately run two vision towers + # run high_res vision tower + high_res = self.vision_tower_high(high_images) + # [bs, c, h, w] -> [bs, h*w, c] + high_res = rearrange(high_res, "b c h w -> b (h w) c") + # run low_res vision tower + low_res = self.vision_tower_low(low_images) + + if self.concat_type == "feature": + images_features = torch.cat([high_res, low_res], dim=-1) + elif self.concat_type == "sequence": + images_features = torch.cat([high_res, low_res], dim=1) + elif self.concat_type == "add": + images_features = high_res + low_res + elif self.concat_type == "tuple": + images_features = (high_res, low_res) + + else: + raise ValueError( + "Currently only support `feature`, `sequence`, `add` and `tuple` concat type." + ) + + return images_features + + +def model_name_to_cls(cls_name): + if "MlpProjector" in cls_name: + cls = MlpProjector + + elif "CLIPVisionTower" in cls_name: + cls = CLIPVisionTower + + elif "HybridVisionTower" in cls_name: + cls = HybridVisionTower + + else: + raise ValueError(f"class_name {cls_name} is invalid.") + + return cls + + +class MultiModalityPreTrainedModel(PreTrainedModel): + config_class = DeepSeekMultiModalityConfig + base_model_prefix = "multi_modality" + _no_split_modules = [] + _skip_keys_device_placement = "past_key_values" + + +@MULTIMODAL_REGISTRY.register_image_feature_input() +@MULTIMODAL_REGISTRY.register_image_pixel_input() +@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) +class DeepSeekMultiModalityCausalLM(VisionLanguageModelBase): + + def __init__( + self, + config: DeepSeekMultiModalityConfig, + vision_language_config: VisionLanguageConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__( + config, + ) + self.config = config + vision_config = config.vision_config + self.image_size = vision_config.params.high_res_cfg.image_size + vision_cls = model_name_to_cls(vision_config.cls) + self.vision_model = vision_cls(**vision_config.params) + self.vision_tower = self.vision_model + aligner_config = config.aligner_config + aligner_cls = model_name_to_cls(aligner_config.cls) + self.aligner = aligner_cls(aligner_config.params) + + language_config = config.language_config + self.language_model = LlamaModel(language_config) + self.image_processor = VLMImageProcessor(self.image_size) + self.logits_processor = LogitsProcessor(language_config.vocab_size) + self.sampler = Sampler() + self.lm_head = ParallelLMHead( + language_config.vocab_size, language_config.hidden_size + ) + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def prepare_inputs_embeds( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + images_seq_mask: torch.LongTensor, + **kwargs, + ): + """ + + Args: + input_ids (torch.LongTensor): [b, T] + pixel_values (torch.FloatTensor): [b, n_images, 3, h, w] + images_seq_mask (torch.BoolTensor): [b, T] + + assert torch.sum(images_seq_mask) == torch.sum(images_emb_mask) + + Returns: + input_embeds (torch.Tensor): [b, T, D] + """ + + bs, n = pixel_values.shape[0:2] + images = rearrange(pixel_values, "b n c h w -> (b n) c h w") + # [b x n, T2, D] + images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( + 
self.vision_model.high_layer_norm.weight.device + ) + images_embeds = self.aligner(self.vision_model(images)) + + # [b x n, T2, D] -> [b, n x T2, D] + images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n) + + # [b, T, D] + input_ids[input_ids < 0] = 0 # ignore the image embeddings + inputs_embeds = self.language_model.get_input_embeddings(input_ids=input_ids) + + # replace with the image embeddings + images_embeds = images_embeds.reshape( + -1, self.config.aligner_config.params.n_embed + ) + inputs_embeds[images_seq_mask] = images_embeds + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs: object, + ): + pixel_values = kwargs.pop("pixel_values", None) + image_features = kwargs.pop("image_features", None) + if image_features and not pixel_values: + pixel_values = image_features + if pixel_values is not None: + image_token_id = 100015 + image_token_mask = input_ids == image_token_id + inputs_embeds = self.prepare_inputs_embeds( + input_ids, + pixel_values.reshape(1, -1, 3, self.image_size, self.image_size), + image_token_mask, + ) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model( + input_ids, positions, kv_caches, attn_metadata, inputs_embeds=inputs_embeds + ) + + return hidden_states + + def compute_logits( + self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata + ) -> torch.Tensor: + logits = self.logits_processor( + self.lm_head.weight, hidden_states, sampling_metadata + ) + return logits + + def load_weights(self, weights): + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "lm" in name: + self.lm_head.weight_loader(self.lm_head.weight, loaded_weight) + continue + if name.startswith("language_model"): + name = name.replace("language_model.model.", "language_model.", 1) + if "rotary_emb.inv_freq" in name: + continue + if "language_model" not in name: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + # if name not in params_dict: + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if ( + "mlp.experts." in name or "mlp.shared_experts." in name + ) and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + # if name.endswith(".bias") and name not in params_dict: + # continue + # Skip experts that are not assigned to this worker. + if ( + "mlp.experts." in name or "mlp.shared_experts." 
in name + ) and name not in params_dict: + continue + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + +AutoImageProcessor.register(VLMImageProcessorConfig, VLMImageProcessor) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 60fc756a12e3d..1fd4e174177c8 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,7 +7,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig) + MPTConfig, RWConfig, + DeepSeekMultiModalityConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -24,6 +25,7 @@ "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, + "multi_modality": DeepSeekMultiModalityConfig, } for name, cls in _CONFIG_REGISTRY.items(): diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index d8170858c2a9a..f79de04a5ad06 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -7,6 +7,7 @@ from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.deepseek_vl import DeepSeekMultiModalityConfig __all__ = [ "ChatGLMConfig", @@ -15,4 +16,5 @@ "RWConfig", "JAISConfig", "MLPSpeculatorConfig", + "DeepSeekMultiModalityConfig", ] diff --git a/vllm/transformers_utils/configs/deepseek_vl.py b/vllm/transformers_utils/configs/deepseek_vl.py new file mode 100644 index 0000000000000..cfdf229531f38 --- /dev/null +++ b/vllm/transformers_utils/configs/deepseek_vl.py @@ -0,0 +1,89 @@ +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
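+#
+# HuggingFace-style configuration classes for DeepSeek-VL: VisionConfig and
+# AlignerConfig hold the vision-tower / projector parameters as attribute
+# dictionaries, and DeepSeekMultiModalityConfig ties them to a LlamaConfig
+# for the language model and registers itself with AutoConfig under the
+# "multi_modality" model type.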
+ +import sys + +from transformers import AutoConfig +from transformers import PretrainedConfig +from transformers import LlamaConfig + +if sys.version_info >= (3, 10): + print("Python version is above 3.10, patching the collections module.") + # Monkey patch collections + import collections + import collections.abc + + for type_name in collections.abc.__all__: + setattr(collections, type_name, getattr(collections.abc, type_name)) + from attrdict import AttrDict + + +class VisionConfig(PretrainedConfig): + model_type = "vision" + cls: str = "" + params: AttrDict = {} + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.cls = kwargs.get("cls", "") + if not isinstance(self.cls, str): + self.cls = self.cls.__name__ + + self.params = AttrDict(kwargs.get("params", {})) + + +class AlignerConfig(PretrainedConfig): + model_type = "aligner" + cls: str = "" + params: AttrDict = {} + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.cls = kwargs.get("cls", "") + if not isinstance(self.cls, str): + self.cls = self.cls.__name__ + + self.params = AttrDict(kwargs.get("params", {})) + + +class DeepSeekMultiModalityConfig(PretrainedConfig): + model_type = "multi_modality" + vision_config: VisionConfig + aligner_config: AlignerConfig + language_config: LlamaConfig + + def __init__(self, **kwargs): + super().__init__(**kwargs) + vision_config = kwargs.get("vision_config", {}) + self.vision_config = VisionConfig(**vision_config) + + aligner_config = kwargs.get("aligner_config", {}) + self.aligner_config = AlignerConfig(**aligner_config) + + language_config = kwargs.get("language_config", {}) + if isinstance(language_config, LlamaConfig): + self.language_config = language_config + else: + self.language_config = LlamaConfig(**language_config) + self.text_config = self.language_config + + +AutoConfig.register("multi_modality", DeepSeekMultiModalityConfig) From de63a4cd553c0b8baa35a53fe2cb33bebcef6e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 26 Jun 2024 12:00:02 +0800 Subject: [PATCH 02/47] fix requirement for deepseek-vl --- requirements-common.txt | 1 + vllm/model_executor/models/deepseek_vl.py | 474 +++++++++--------- .../transformers_utils/configs/deepseek_vl.py | 20 +- 3 files changed, 240 insertions(+), 255 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 05969cfa5d65f..c0f3b14dc0896 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -21,3 +21,4 @@ lm-format-enforcer == 0.10.1 outlines >= 0.0.43 # Requires torch >= 2.1.0 typing_extensions filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +timm >= 0.9.16 # Required for deepseek-vl model \ No newline at end of file diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 88178d9d773a2..80ba6e7a5b911 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -21,8 +21,6 @@ import warnings import copy -from dataclasses import dataclass -from functools import partial from dataclasses import dataclass from functools import partial from typing import ( @@ -46,19 +44,14 @@ import numpy as np import torchvision import torchvision.transforms.functional -import torch.nn.functional as F - - -from einops import rearrange from transformers import PreTrainedModel -from transformers.configuration_utils import PretrainedConfig -from einops import rearrange from PIL import Image from transformers import 
AutoImageProcessor, PretrainedConfig -from transformers.image_processing_utils import BaseImageProcessor, BatchFeature +from transformers.image_processing_utils import ( + BaseImageProcessor, + BatchFeature, +) from transformers.image_utils import to_numpy_array -from einops import rearrange -from transformers import PreTrainedModel from timm.layers import ( AttentionPoolLatent, DropPath, @@ -68,13 +61,13 @@ PatchEmbed, resample_abs_pos_embed, ) -from timm.models._manipulate import checkpoint_seq, named_apply - +from timm.models._manipulate import checkpoint_seq from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, ) from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -86,7 +79,6 @@ from .vlm_base import VisionLanguageModelBase from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig - ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -94,6 +86,18 @@ IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) +class AttrDict: + + def __init__(self, entries): + for key, value in entries.items(): + if isinstance(value, dict): + entries[key] = AttrDict(value) + self.__dict__.update(entries) + + def get(self, key, default_val=None): + return self.__dict__.get(key, default_val) + + def expand2square(pil_img, background_color): width, height = pil_img.size if width == height: @@ -205,7 +209,8 @@ def resize(self, pil_img: Image) -> np.ndarray: pil_img = torchvision.transforms.functional.resize( pil_img, size, - interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC, + interpolation=torchvision.transforms.functional.InterpolationMode. 
+ BICUBIC, antialias=True, ) @@ -217,7 +222,10 @@ def resize(self, pil_img: Image) -> np.ndarray: return x - def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature: + def preprocess(self, + images, + return_tensors: str = "pt", + **kwargs) -> BatchFeature: # resize and pad to [self.image_size, self.image_size] # then convert from [H, W, 3] to [3, H, W] # print(images) @@ -233,8 +241,7 @@ def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeatu image=image, scale=self.rescale_factor, input_data_format="channels_first", - ) - for image in images + ) for image in images ] # normalize @@ -245,8 +252,7 @@ def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeatu mean=self.image_mean, std=self.image_std, input_data_format="channels_first", - ) - for image in images + ) for image in images ] data = {"pixel_values": images} @@ -258,9 +264,10 @@ def default_shape(self): class MlpProjector(nn.Module): + def __init__(self, cfg): super().__init__() - + cfg = AttrDict(cfg) self.cfg = cfg if cfg.projector_type == "identity": @@ -294,14 +301,16 @@ def __init__(self, cfg): self.layers = modules def forward( - self, x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor] + self, + x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor], ): """ Args: - x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, - then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); - otherwise it is the feature from the single vision encoder. + x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: + if it is a tuple of torch.Tensor, + then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); + otherwise it is the feature from the single vision encoder. Returns: x (torch.Tensor): [b, s, c] @@ -360,7 +369,7 @@ def norm_cdf(x): def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): # type: (torch.Tensor, float, float, float, float) -> torch.Tensor r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first - convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its orignal dtype. + convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its original dtype. Fills the input Tensor with values drawn from a truncated normal distribution. 
The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within @@ -387,20 +396,10 @@ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): def init_weights(self): if self.pos_embed is not None: - trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5) + trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1]**-0.5) trunc_normal_(self.latent, std=self.latent_dim**-0.5) -def init_weights_vit_timm(module: nn.Module, name: str = "") -> None: - """ViT weight initialization, original timm impl (for reproducibility)""" - if isinstance(module, nn.Linear): - trunc_normal_(module.weight, std=0.02) - if module.bias is not None: - nn.init.zeros_(module.bias) - elif hasattr(module, "init_weights"): - module.init_weights() - - class SigLipAttention(nn.Module): fused_attn: Final[bool] @@ -427,15 +426,13 @@ def __init__( self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0.0 else nn.Identity() + self.proj_drop = (nn.Dropout(proj_drop) + if proj_drop > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape - qkv = ( - self.qkv(x) - .reshape(B, N, 3, self.num_heads, self.head_dim) - .permute(2, 0, 3, 1, 4) - ) + qkv = (self.qkv(x).reshape(B, N, 3, self.num_heads, + self.head_dim).permute(2, 0, 3, 1, 4)) q, k, v = qkv.unbind(0) q, k = self.q_norm(q), self.k_norm(k) @@ -460,6 +457,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class LayerScale(nn.Module): + def __init__( self, dim: int, @@ -475,6 +473,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class SigLipBlock(nn.Module): + def __init__( self, dim: int, @@ -501,10 +500,10 @@ def __init__( proj_drop=proj_drop, norm_layer=norm_layer, ) - self.ls1 = ( - LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - ) - self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.ls1 = (LayerScale(dim, init_values=init_values) + if init_values else nn.Identity()) + self.drop_path1 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) self.norm2 = norm_layer(dim) self.mlp = mlp_layer( @@ -513,10 +512,10 @@ def __init__( act_layer=act_layer, drop=proj_drop, ) - self.ls2 = ( - LayerScale(dim, init_values=init_values) if init_values else nn.Identity() - ) - self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.ls2 = (LayerScale(dim, init_values=init_values) + if init_values else nn.Identity()) + self.drop_path2 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -560,7 +559,6 @@ def __init__( proj_drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.0, - weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "", embed_layer: Callable = PatchEmbed, norm_layer: Optional[LayerType] = None, act_layer: Optional[LayerType] = None, @@ -573,7 +571,7 @@ def __init__( img_size: Input image size. patch_size: Patch size. in_chans: Number of image input channels. - num_classes: Mumber of classes for classification head. + num_classes: Number of classes for classification head. global_pool: Type of global pooling for final sequence (default: 'token'). embed_dim: Transformer embedding dimension. 
depth: Depth of transformer. @@ -635,16 +633,14 @@ def __init__( ) num_patches = self.patch_embed.num_patches - self.cls_token = ( - nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None - ) - self.reg_token = ( - nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None - ) - embed_len = ( - num_patches if no_embed_class else num_patches + self.num_prefix_tokens - ) - self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02) + self.cls_token = (nn.Parameter(torch.zeros(1, 1, embed_dim)) + if class_token else None) + self.reg_token = (nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) + if reg_tokens else None) + embed_len = (num_patches if no_embed_class else num_patches + + self.num_prefix_tokens) + self.pos_embed = nn.Parameter( + torch.randn(1, embed_len, embed_dim) * 0.02) self.pos_drop = nn.Dropout(p=pos_drop_rate) if patch_drop_rate > 0: self.patch_drop = PatchDropout( @@ -655,28 +651,24 @@ def __init__( self.patch_drop = nn.Identity() self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity() - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, depth) - ] # stochastic depth decay rule - self.blocks = nn.Sequential( - *[ - block_fn( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_norm=qk_norm, - init_values=init_values, - proj_drop=proj_drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - act_layer=act_layer, - mlp_layer=mlp_layer, - ) - for i in range(depth) - ] - ) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.Sequential(*[ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + init_values=init_values, + proj_drop=proj_drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + mlp_layer=mlp_layer, + ) for i in range(depth) + ]) self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() # Classifier Head @@ -692,20 +684,8 @@ def __init__( self.attn_pool = None self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() self.head_drop = nn.Dropout(drop_rate) - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) - - if weight_init != "skip": - self.init_weights(weight_init) - - def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None: - assert mode in ("jax", "jax_nlhb", "moco", "") - # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0 - trunc_normal_(self.pos_embed, std=0.02) - if self.cls_token is not None: - nn.init.normal_(self.cls_token, std=1e-6) - named_apply(init_weights_vit_timm, self) + self.head = (nn.Linear(self.embed_dim, num_classes) + if num_classes > 0 else nn.Identity()) @torch.jit.ignore def no_weight_decay(self) -> Set: @@ -715,7 +695,7 @@ def no_weight_decay(self) -> Set: def group_matcher(self, coarse: bool = False) -> Dict: return dict( stem=r"^cls_token|pos_embed|patch_embed", # stem and embed - blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))], + blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999, ))], ) @torch.jit.ignore @@ -731,15 +711,14 @@ def reset_classifier(self, num_classes: int, global_pool=None) -> None: if global_pool is not None: assert global_pool in ("", "avg", "token", "map") if global_pool == "map" and self.attn_pool is None: - assert ( - False - ), "Cannot currently add attention pooling in 
reset_classifier()." + raise AssertionError( + "Cannot currently add attention pooling in reset_classifier()." + ) elif global_pool != "map " and self.attn_pool is not None: self.attn_pool = None # remove attention pooling self.global_pool = global_pool - self.head = ( - nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() - ) + self.head = (nn.Linear(self.embed_dim, num_classes) + if num_classes > 0 else nn.Identity()) def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: if self.dynamic_img_size: @@ -747,7 +726,8 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: pos_embed = resample_abs_pos_embed( self.pos_embed, (H, W), - num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens, + num_prefix_tokens=(0 if self.no_embed_class else + self.num_prefix_tokens), ) x = x.view(B, -1, C) else: @@ -781,8 +761,7 @@ def _intermediate_layers( ) -> List[torch.Tensor]: outputs, num_blocks = [], len(self.blocks) take_indices = set( - range(num_blocks - n, num_blocks) if isinstance(n, int) else n - ) + range(num_blocks - n, num_blocks) if isinstance(n, int) else n) # forward pass x = self.patch_embed(x) @@ -811,15 +790,14 @@ def get_intermediate_layers( outputs = self._intermediate_layers(x, n) if norm: outputs = [self.norm(out) for out in outputs] - prefix_tokens = [out[:, 0 : self.num_prefix_tokens] for out in outputs] - outputs = [out[:, self.num_prefix_tokens :] for out in outputs] + prefix_tokens = [out[:, 0:self.num_prefix_tokens] for out in outputs] + outputs = [out[:, self.num_prefix_tokens:] for out in outputs] if reshape: grid_size = self.patch_embed.grid_size outputs = [ - out.reshape(x.shape[0], grid_size[0], grid_size[1], -1) - .permute(0, 3, 1, 2) - .contiguous() + out.reshape(x.shape[0], grid_size[0], grid_size[1], + -1).permute(0, 3, 1, 2).contiguous() for out in outputs ] @@ -839,11 +817,13 @@ def forward_features(self, x: torch.Tensor) -> torch.Tensor: x = self.norm(x) return x - def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor: + def forward_head(self, + x: torch.Tensor, + pre_logits: bool = False) -> torch.Tensor: if self.attn_pool is not None: x = self.attn_pool(x) elif self.global_pool == "avg": - x = x[:, self.num_prefix_tokens :].mean(dim=1) + x = x[:, self.num_prefix_tokens:].mean(dim=1) elif self.global_pool: x = x[:, 0] # class token x = self.fc_norm(x) @@ -912,9 +892,8 @@ def create_siglip_vit( ckpt_path: str = "", **kwargs, ): - assert ( - model_name in SigLIP_MODEL_CONFIG.keys() - ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}" + assert (model_name in SigLIP_MODEL_CONFIG + ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}" vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name]) @@ -933,7 +912,6 @@ def create_siglip_vit( class_token=vision_cfg.class_token, global_pool=vision_cfg.global_pool, ignore_head=kwargs.get("ignore_head", True), - weight_init=kwargs.get("weight_init", "skip"), num_classes=0, ) @@ -941,15 +919,14 @@ def create_siglip_vit( state_dict = torch.load(ckpt_path, map_location="cpu") incompatible_keys = model.load_state_dict(state_dict, strict=False) - print( - f"SigLIP-ViT restores from {ckpt_path},\n" - f"\tincompatible_keys:', {incompatible_keys}." 
- ) + print(f"SigLIP-ViT restores from {ckpt_path},\n" + f"\tincompatible_keys:', {incompatible_keys}.") return model class MLPBlock(nn.Module): + def __init__( self, embedding_dim: int, @@ -968,6 +945,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: super().__init__() self.weight = nn.Parameter(torch.ones(num_channels)) @@ -984,25 +962,26 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa class ImageEncoderViT(nn.Module): + def __init__( - self, - img_size: int = 1024, - patch_size: int = 16, - in_chans: int = 3, - embed_dim: int = 768, - depth: int = 12, - num_heads: int = 12, - mlp_ratio: float = 4.0, - out_chans: int = 256, - qkv_bias: bool = True, - norm_layer: Type[nn.Module] = nn.LayerNorm, - act_layer: Type[nn.Module] = nn.GELU, - use_abs_pos: bool = True, - use_rel_pos: bool = False, - rel_pos_zero_init: bool = True, - window_size: int = 0, - global_attn_indexes: Tuple[int, ...] = (), - downsample_channels: Tuple[int, ...] = (512, 1024), + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + downsample_channels: Tuple[int, ...] = (512, 1024), ) -> None: """ Args: @@ -1038,9 +1017,11 @@ def __init__( # Initialize absolute positional embedding with pretrain image size. 
self.pos_embed = nn.Parameter( torch.zeros( - 1, img_size // patch_size, img_size // patch_size, embed_dim - ) - ) + 1, + img_size // patch_size, + img_size // patch_size, + embed_dim, + )) self.blocks = nn.ModuleList() for i in range(depth): @@ -1088,8 +1069,7 @@ def __init__( stride=2, padding=1, bias=False, - ) - ) + )) in_channels = out_channels self.downsamples = nn.Sequential(*downsamples) @@ -1113,13 +1093,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.neck(x.permute(0, 3, 1, 2)) x_dtype = x.dtype - x = F.interpolate( - x.float(), size=(96, 96), mode="bilinear", align_corners=False - ).to(x_dtype) + x = F.interpolate(x.float(), + size=(96, 96), + mode="bilinear", + align_corners=False).to(x_dtype) x = self.downsamples(x) if self.sam_hd: - first_global_feature = self.neck_hd(global_features[0].permute(0, 3, 1, 2)) + first_global_feature = self.neck_hd(global_features[0].permute( + 0, 3, 1, 2)) x_dtype = first_global_feature.dtype first_global_feature = F.interpolate( first_global_feature.float(), @@ -1127,7 +1109,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: mode="bilinear", align_corners=False, ) - first_global_feature = self.downsamples(first_global_feature.to(x_dtype)) + first_global_feature = self.downsamples( + first_global_feature.to(x_dtype)) x = x + first_global_feature * self.hd_alpha_downsamples return x @@ -1172,13 +1155,14 @@ def __init__( qkv_bias=qkv_bias, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, - input_size=input_size if window_size == 0 else (window_size, window_size), + input_size=(input_size if window_size == 0 else + (window_size, window_size)), ) self.norm2 = norm_layer(dim) - self.mlp = MLPBlock( - embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer - ) + self.mlp = MLPBlock(embedding_dim=dim, + mlp_dim=int(dim * mlp_ratio), + act=act_layer) self.window_size = window_size @@ -1237,32 +1221,29 @@ def __init__( input_size is not None ), "Input size must be provided if using relative positional encoding." 
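# Note on the relative-position scheme used below (see get_rel_pos and
# add_decomposed_rel_pos): rather than a joint 2-D table, two 1-D embedding
# tables are kept, one per spatial axis, each with 2*dim - 1 rows so that
# every signed query/key offset along that axis has an entry; the per-axis
# terms are then added to the attention logits separately.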
# initialize relative positional embeddings - self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) - self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + self.rel_pos_h = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter( + torch.zeros(2 * input_size[1] - 1, head_dim)) def forward(self, x: torch.Tensor) -> torch.Tensor: B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H * W, C) - qkv = ( - self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - ) + qkv = (self.qkv(x).reshape(B, H * W, 3, self.num_heads, + -1).permute(2, 0, 3, 1, 4)) # q, k, v with shape (B * nHead, H * W, C) q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) def do_attention(q, k, v): attn = (q * self.scale) @ k.transpose(-2, -1) if self.use_rel_pos: - attn = add_decomposed_rel_pos( - attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W) - ) + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, + self.rel_pos_w, (H, W), (H, W)) attn = attn.softmax(dim=-1) - x = ( - (attn @ v) - .view(B, self.num_heads, H, W, -1) - .permute(0, 2, 3, 1, 4) - .reshape(B, H, W, -1) - ) + x = ((attn @ v).view(B, self.num_heads, H, W, + -1).permute(0, 2, 3, 1, + 4).reshape(B, H, W, -1)) return x @@ -1274,9 +1255,8 @@ def do_attention(q, k, v): return x -def window_partition( - x: torch.Tensor, window_size: int -) -> Tuple[torch.Tensor, Tuple[int, int]]: +def window_partition(x: torch.Tensor, + window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]: """ Partition into non-overlapping windows with padding if needed. Args: @@ -1295,10 +1275,10 @@ def window_partition( x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) Hp, Wp = H + pad_h, W + pad_w - x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) - windows = ( - x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) - ) + x = x.view(B, Hp // window_size, window_size, Wp // window_size, + window_size, C) + windows = (x.permute(0, 1, 3, 2, 4, + 5).contiguous().view(-1, window_size, window_size, C)) return windows, (Hp, Wp) @@ -1322,9 +1302,8 @@ def window_unpartition( Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) - x = windows.view( - B, Hp // window_size, Wp // window_size, window_size, window_size, -1 - ) + x = windows.view(B, Hp // window_size, Wp // window_size, window_size, + window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) if Hp > H or Wp > W: @@ -1332,7 +1311,8 @@ def window_unpartition( return x -def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: +def get_rel_pos(q_size: int, k_size: int, + rel_pos: torch.Tensor) -> torch.Tensor: """ Get relative positional embeddings according to the relative positions of query and key sizes. @@ -1353,14 +1333,16 @@ def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor size=max_rel_dist, mode="linear", ) - rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + rel_pos_resized = rel_pos_resized.reshape(-1, + max_rel_dist).permute(1, 0) else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. 
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) - relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + relative_coords = (q_coords - + k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) return rel_pos_resized[relative_coords.long()] @@ -1397,11 +1379,8 @@ def add_decomposed_rel_pos( rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) - attn = ( - attn.view(B, q_h, q_w, k_h, k_w) - + rel_h[:, :, :, :, None] - + rel_w[:, :, :, None, :] - ).view(B, q_h * q_w, k_h * k_w) + attn = (attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :]).view(B, q_h * q_w, k_h * k_w) return attn @@ -1430,7 +1409,11 @@ def __init__( super().__init__() self.proj = nn.Conv2d( - in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + in_chans, + embed_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -1492,7 +1475,7 @@ def create_sam_vit( **kwargs, ): assert ( - model_name in SAM_MODEL_CONFIG.keys() + model_name in SAM_MODEL_CONFIG ), f"model name: {model_name} should be in {SAM_MODEL_CONFIG.keys()}" sam_cfg = SAMViTCfg(**SAM_MODEL_CONFIG[model_name]) @@ -1521,6 +1504,7 @@ def create_sam_vit( class CLIPVisionTower(nn.Module): + def __init__( self, model_name: str = "siglip_large_patch16_384", @@ -1548,13 +1532,11 @@ def __init__( } vision_tower_params.update(kwargs) self.vision_tower, self.forward_kwargs = self.build_vision_tower( - vision_tower_params - ) + vision_tower_params) if pixel_mean is not None and pixel_std is not None: - image_norm = torchvision.transforms.Normalize( - mean=pixel_mean, std=pixel_std - ) + image_norm = torchvision.transforms.Normalize(mean=pixel_mean, + std=pixel_std) else: image_norm = None @@ -1573,7 +1555,8 @@ def build_vision_tower(self, vision_tower_params): else: # huggingface from transformers import CLIPVisionModel - vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params) + vision_tower = CLIPVisionModel.from_pretrained( + **vision_tower_params) forward_kwargs = dict(output_hidden_states=True) return vision_tower, forward_kwargs @@ -1583,18 +1566,18 @@ def feature_select(self, image_forward_outs): # the output has been the self.select_layer"s features image_features = image_forward_outs else: - image_features = image_forward_outs.hidden_states[self.select_layer] + image_features = image_forward_outs.hidden_states[ + self.select_layer] if self.select_feature == "patch": # if the output has cls_token image_features = image_features[:, 1:] - elif self.select_feature == "cls_patch": - image_features = image_features - elif self.select_feature == "same": + elif (self.select_feature == "cls_patch" + or self.select_feature == "same"): image_features = image_features - else: - raise ValueError(f"Unexpected select feature: {self.select_feature}") + raise ValueError( + f"Unexpected select feature: {self.select_feature}") return image_features def forward(self, images): @@ -1616,6 +1599,7 @@ def forward(self, images): class HybridVisionTower(nn.Module): + def __init__( self, high_res_cfg: Dict, @@ -1632,7 +1616,8 @@ def __init__( self.low_res_size = low_res_cfg["image_size"] self.concat_type = concat_type - self.high_layer_norm = nn.LayerNorm(high_res_cfg.get("output_dim", 1024)) + self.high_layer_norm = nn.LayerNorm( + high_res_cfg.get("output_dim", 1024)) self.low_layer_norm = 
nn.LayerNorm(low_res_cfg.get("output_dim", 1024)) if freeze_high: @@ -1652,7 +1637,8 @@ def __init__( p.requires_grad = False self.vision_tower_low = self.vision_tower_low.eval() - self.resize = torchvision.transforms.Resize(self.low_res_size, antialias=True) + self.resize = torchvision.transforms.Resize(self.low_res_size, + antialias=True) def forward(self, images: torch.Tensor): """ @@ -1674,7 +1660,9 @@ def forward(self, images: torch.Tensor): # run high_res vision tower high_res = self.vision_tower_high(high_images) # [bs, c, h, w] -> [bs, h*w, c] - high_res = rearrange(high_res, "b c h w -> b (h w) c") + b, c, h, w = high_res.shape + high_res = torch.einsum("bchw->bhwc", high_res) + high_res = high_res.reshape(b, h * w, c) # run low_res vision tower low_res = self.vision_tower_low(low_images) @@ -1730,16 +1718,19 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): - super().__init__( - config, - ) + super().__init__(config, ) self.config = config vision_config = config.vision_config - self.image_size = vision_config.params.high_res_cfg.image_size + aligner_config = config.aligner_config + self.image_size = aligner_config.params["input_dim"] + self.image_size = vision_config.params.get("image_size") + if not self.image_size: + # Get image size for 7b model + self.image_size = vision_config.params["high_res_cfg"][ + "image_size"] vision_cls = model_name_to_cls(vision_config.cls) self.vision_model = vision_cls(**vision_config.params) self.vision_tower = self.vision_model - aligner_config = config.aligner_config aligner_cls = model_name_to_cls(aligner_config.cls) self.aligner = aligner_cls(aligner_config.params) @@ -1748,9 +1739,8 @@ def __init__( self.image_processor = VLMImageProcessor(self.image_size) self.logits_processor = LogitsProcessor(language_config.vocab_size) self.sampler = Sampler() - self.lm_head = ParallelLMHead( - language_config.vocab_size, language_config.hidden_size - ) + self.lm_head = ParallelLMHead(language_config.vocab_size, + language_config.hidden_size) def sample( self, @@ -1781,24 +1771,26 @@ def prepare_inputs_embeds( """ bs, n = pixel_values.shape[0:2] - images = rearrange(pixel_values, "b n c h w -> (b n) c h w") + p_b, p_n, p_c, p_h, p_w = pixel_values.shape + images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) # [b x n, T2, D] - images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( - self.vision_model.high_layer_norm.weight.device - ) + # images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( + # self.vision_model.high_layer_norm.weight.device + # ) images_embeds = self.aligner(self.vision_model(images)) # [b x n, T2, D] -> [b, n x T2, D] - images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n) + _, t, d = images_embeds.shape + images_embeds = images_embeds.reshape(bs, n * t, d) # [b, T, D] input_ids[input_ids < 0] = 0 # ignore the image embeddings - inputs_embeds = self.language_model.get_input_embeddings(input_ids=input_ids) + inputs_embeds = self.language_model.get_input_embeddings( + input_ids=input_ids) # replace with the image embeddings images_embeds = images_embeds.reshape( - -1, self.config.aligner_config.params.n_embed - ) + -1, self.config.aligner_config.params["n_embed"]) inputs_embeds[images_seq_mask] = images_embeds return inputs_embeds @@ -1820,7 +1812,8 @@ def forward( image_token_mask = input_ids == image_token_id inputs_embeds = self.prepare_inputs_embeds( input_ids, - pixel_values.reshape(1, -1, 3, self.image_size, 
self.image_size), + pixel_values.reshape(1, -1, 3, self.image_size, + self.image_size), image_token_mask, ) @@ -1829,17 +1822,19 @@ def forward( inputs_embeds = None hidden_states = self.language_model( - input_ids, positions, kv_caches, attn_metadata, inputs_embeds=inputs_embeds + input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds, ) return hidden_states - def compute_logits( - self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata - ) -> torch.Tensor: - logits = self.logits_processor( - self.lm_head.weight, hidden_states, sampling_metadata - ) + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) return logits def load_weights(self, weights): @@ -1857,12 +1852,14 @@ def load_weights(self, weights): self.lm_head.weight_loader(self.lm_head.weight, loaded_weight) continue if name.startswith("language_model"): - name = name.replace("language_model.model.", "language_model.", 1) + name = name.replace("language_model.model.", "language_model.", + 1) if "rotary_emb.inv_freq" in name: continue if "language_model" not in name: param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) continue for param_name, weight_name, shard_id in stacked_params_mapping: @@ -1874,9 +1871,8 @@ def load_weights(self, weights): if name.endswith(".bias") and name not in params_dict: continue # Skip experts that are not assigned to this worker. - if ( - "mlp.experts." in name or "mlp.shared_experts." in name - ) and name not in params_dict: + if ("mlp.experts." in name or "mlp.shared_experts." + in name) and name not in params_dict: continue param = params_dict[name] weight_loader = param.weight_loader @@ -1887,14 +1883,14 @@ def load_weights(self, weights): # if name.endswith(".bias") and name not in params_dict: # continue # Skip experts that are not assigned to this worker. - if ( - "mlp.experts." in name or "mlp.shared_experts." in name - ) and name not in params_dict: + if ("mlp.experts." in name or "mlp.shared_experts." + in name) and name not in params_dict: continue if name not in params_dict: continue param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader = getattr(param, "weight_loader", + default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/configs/deepseek_vl.py b/vllm/transformers_utils/configs/deepseek_vl.py index cfdf229531f38..b14d8cbdf5b3c 100644 --- a/vllm/transformers_utils/configs/deepseek_vl.py +++ b/vllm/transformers_utils/configs/deepseek_vl.py @@ -17,27 +17,15 @@ # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
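The `attrdict` import removed below is no longer needed: attribute-style access to the nested `params` dicts is now handled by the small `AttrDict` helper added to `vllm/model_executor/models/deepseek_vl.py` earlier in this patch, and the config classes keep plain dicts. A minimal sketch of that helper's behavior, with illustrative keys and values:

    from vllm.model_executor.models.deepseek_vl import AttrDict

    cfg = AttrDict({"projector_type": "mlp_gelu", "params": {"n_embed": 2048}})
    cfg.projector_type   # "mlp_gelu"  (attribute-style access)
    cfg.params.n_embed   # 2048        (nested dicts are wrapped recursively)
    cfg.get("depth", 2)  # 2           (dict-style fallback via .get)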
-import sys - from transformers import AutoConfig from transformers import PretrainedConfig from transformers import LlamaConfig -if sys.version_info >= (3, 10): - print("Python version is above 3.10, patching the collections module.") - # Monkey patch collections - import collections - import collections.abc - - for type_name in collections.abc.__all__: - setattr(collections, type_name, getattr(collections.abc, type_name)) - from attrdict import AttrDict - class VisionConfig(PretrainedConfig): model_type = "vision" cls: str = "" - params: AttrDict = {} + params: dict = {} def __init__(self, **kwargs): super().__init__(**kwargs) @@ -46,13 +34,13 @@ def __init__(self, **kwargs): if not isinstance(self.cls, str): self.cls = self.cls.__name__ - self.params = AttrDict(kwargs.get("params", {})) + self.params = kwargs.get("params", {}) class AlignerConfig(PretrainedConfig): model_type = "aligner" cls: str = "" - params: AttrDict = {} + params: dict = {} def __init__(self, **kwargs): super().__init__(**kwargs) @@ -61,7 +49,7 @@ def __init__(self, **kwargs): if not isinstance(self.cls, str): self.cls = self.cls.__name__ - self.params = AttrDict(kwargs.get("params", {})) + self.params = kwargs.get("params", {}) class DeepSeekMultiModalityConfig(PretrainedConfig): From 10b5cddb5e6492a274d05dfd615179d886888ea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 26 Jun 2024 17:27:28 +0800 Subject: [PATCH 03/47] Removed unused code, added documentation, added examples --- docs/source/models/supported_models.rst | 4 + examples/deepseek_vl_example.py | 125 ++++++++++++++++++++++ vllm/model_executor/models/deepseek_vl.py | 13 +-- 3 files changed, 131 insertions(+), 11 deletions(-) create mode 100644 examples/deepseek_vl_example.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 47737ae525209..86e9304e50272 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -163,6 +163,10 @@ Alongside each architecture, we include some popular models that use it. - Xverse - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - + * - :code:`DeepSeekMultiModalityCausalLM` + - deepseek-ai + - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py new file mode 100644 index 0000000000000..ec610a622ad35 --- /dev/null +++ b/examples/deepseek_vl_example.py @@ -0,0 +1,125 @@ +import argparse +import os +import subprocess + +import torch +from PIL import Image + +from vllm import LLM +from vllm.multimodal.image import ImageFeatureData, ImagePixelData +from vllm.model_executor.models.deepseek_vl import VLMImageProcessor + +# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. 
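# For reference, a typical invocation once this file lands as
# examples/deepseek_vl_example.py (flag names follow the argparse setup at the
# bottom of this script; the AWS CLI is assumed to be installed for the
# anonymous `aws s3 sync` download of the sample images):
#
#     python examples/deepseek_vl_example.py --type pixel_values
#     python examples/deepseek_vl_example.py --type image_features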
+# You can use `.buildkite/download-images.sh` to download them +from vllm import SamplingParams + +sample_params = SamplingParams(temperature=0, max_tokens=1024) + +model = "deepseek-ai/deepseek-vl-7b-chat" + + +def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): + llm = LLM( + model=model, + image_input_type="pixel_values", + image_token_id=100015, + image_input_shape="1,3,1024,1024", + image_feature_size=576, + disable_image_processor=False, + gpu_memory_utilization=0.9, + max_model_len=3072, + enforce_eager=True, + ) + + prompt = f"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" + + if disable_image_processor: + image = get_image_features() + else: + image = Image.open("images/stop_sign.jpg") + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImagePixelData(image), + }, + sampling_params=sample_params, + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def run_deepseek_vl_image_features(): + llm = LLM( + model=model, + image_input_type="image_features", + image_token_id=100015, + image_input_shape="1,3,1024,1024", + image_feature_size=576, + gpu_memory_utilization=0.9, + max_model_len=3072, + enforce_eager=True, + ) + prompt = f"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" + + image: torch.Tensor = get_image_features() + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": ImageFeatureData(image), + }, + sampling_params=sample_params, + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def get_image_features(): + image_feature = VLMImageProcessor(1024)(Image.open("images/stop_sign.jpg"))[ + "pixel_values" + ] + torch.save(image_feature, "images/deepseek_vl_stop_sign.pt") + return torch.load("images/deepseek_vl_stop_sign.pt") + + +def main(args): + if args.type == "pixel_values": + run_deepseek_vl_pixel_values() + else: + run_deepseek_vl_image_features() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Demo on deepseek-vl") + parser.add_argument( + "--type", + type=str, + choices=["pixel_values", "image_features"], + default="pixel_values", + help="image input type", + ) + args = parser.parse_args() + # Download from s3 + s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" + local_directory = "images" + + # Make sure the local directory exists or create it + os.makedirs(local_directory, exist_ok=True) + + # Use AWS CLI to sync the directory, assume anonymous access + subprocess.check_call( + [ + "aws", + "s3", + "sync", + s3_bucket_path, + local_directory, + "--no-sign-request", + ] + ) + main(args) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 80ba6e7a5b911..ec1ff36e5ab0a 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -167,7 +167,7 @@ def __init__( 0.27577711, ), rescale_factor: float = 1.0 / 255.0, - do_normalize: bool = True, + do_normalize: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -228,7 +228,6 @@ def preprocess(self, **kwargs) -> BatchFeature: # resize and 
pad to [self.image_size, self.image_size] # then convert from [H, W, 3] to [3, H, W] - # print(images) if not isinstance(images, List): images = [ images, @@ -1773,10 +1772,6 @@ def prepare_inputs_embeds( bs, n = pixel_values.shape[0:2] p_b, p_n, p_c, p_h, p_w = pixel_values.shape images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) - # [b x n, T2, D] - # images = images.to(self.vision_model.high_layer_norm.weight.dtype).to( - # self.vision_model.high_layer_norm.weight.device - # ) images_embeds = self.aligner(self.vision_model(images)) # [b x n, T2, D] -> [b, n x T2, D] @@ -1805,7 +1800,7 @@ def forward( ): pixel_values = kwargs.pop("pixel_values", None) image_features = kwargs.pop("image_features", None) - if image_features and not pixel_values: + if image_features is not None and pixel_values is None: pixel_values = image_features if pixel_values is not None: image_token_id = 100015 @@ -1879,10 +1874,6 @@ def load_weights(self, weights): weight_loader(param, loaded_weight, shard_id) break else: - # Skip loading extra bias for GPTQ models. - # if name.endswith(".bias") and name not in params_dict: - # continue - # Skip experts that are not assigned to this worker. if ("mlp.experts." in name or "mlp.shared_experts." in name) and name not in params_dict: continue From 09633373747b86beffbc4df13528a9358e388ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 26 Jun 2024 18:16:48 +0800 Subject: [PATCH 04/47] Added test cases, deleted model dependencies, and optimized code --- requirements-common.txt | 3 +- tests/models/test_deepseek_vl.py | 128 ++++++++++++++++++++++ vllm/model_executor/models/deepseek_vl.py | 2 +- 3 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 tests/models/test_deepseek_vl.py diff --git a/requirements-common.txt b/requirements-common.txt index c0f3b14dc0896..a0063062f4c36 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -20,5 +20,4 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.10.1 outlines >= 0.0.43 # Requires torch >= 2.1.0 typing_extensions -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 -timm >= 0.9.16 # Required for deepseek-vl model \ No newline at end of file +filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py new file mode 100644 index 0000000000000..1b74592d5e571 --- /dev/null +++ b/tests/models/test_deepseek_vl.py @@ -0,0 +1,128 @@ +from typing import List, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.config import VisionLanguageConfig + +from ..conftest import IMAGE_FILES + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", + "You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", +] + +assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) + + +def iter_llava_configs(model_name: str): + image_hw_to_feature_size = { + (1024, 1024): 576, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), + ]: + yield ( + model_name, + VisionLanguageConfig( + image_input_type=input_type, + image_feature_size=f, + image_token_id=100015, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None, + ), + ) + + +model_and_vl_config = [ + *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), +] + + +def vllm_to_hf_output( + vllm_output: Tuple[List[int], str], vlm_config: VisionLanguageConfig, model_id: str +): + """Sanitize vllm output to be comparable with hf output. + The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". + """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id + for idx, input_id in enumerate(input_ids) + if input_id != image_token_id or input_ids[idx - 1] != image_token_id + ] + hf_output_str = output_str.replace( + image_token_str * vlm_config.image_feature_size, "" + ) + + return hf_input_ids, hf_output_str + + +# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models( + hf_runner, + vllm_runner, + hf_images, + vllm_images, + model_and_config, + dtype: str, + max_tokens: int, +) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + model_id, vlm_config = model_and_config + + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs = hf_model.generate_greedy( + HF_IMAGE_PROMPTS, max_tokens, images=hf_images + ) + + vllm_image_prompts = [ + p.replace( + "", "" * vlm_config.image_feature_size + ) + for p in HF_IMAGE_PROMPTS + ] + + with vllm_runner( + model_id, dtype=dtype, enforce_eager=True, **vlm_config.as_cli_args_dict() + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + vllm_image_prompts, max_tokens, images=vllm_images + ) + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id + ) + assert ( + hf_output_str == vllm_output_str + ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" + assert ( + hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index ec1ff36e5ab0a..4f9ca839cac75 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -1734,7 +1734,7 @@ def __init__( self.aligner = aligner_cls(aligner_config.params) language_config = config.language_config - self.language_model = LlamaModel(language_config) + self.language_model = LlamaModel(language_config, cache_config, quant_config) self.image_processor = VLMImageProcessor(self.image_size) self.logits_processor = LogitsProcessor(language_config.vocab_size) self.sampler = Sampler() From 9752b0c2362c288f8a7e32d8f7e37ef89a8bc15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 27 Jun 2024 15:31:08 +0800 Subject: [PATCH 05/47] code reformat --- examples/deepseek_vl_example.py | 23 ++++++------ tests/models/test_deepseek_vl.py | 43 ++++++++++------------- vllm/model_executor/models/deepseek_vl.py | 11 +++--- 3 files changed, 35 insertions(+), 42 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index ec610a622ad35..2032fb2a9473a 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -80,9 +80,8 @@ def run_deepseek_vl_image_features(): def get_image_features(): - image_feature = VLMImageProcessor(1024)(Image.open("images/stop_sign.jpg"))[ - "pixel_values" - ] + image_feature = VLMImageProcessor(1024)( + Image.open("images/stop_sign.jpg"))["pixel_values"] torch.save(image_feature, "images/deepseek_vl_stop_sign.pt") return torch.load("images/deepseek_vl_stop_sign.pt") @@ -112,14 +111,12 @@ def main(args): os.makedirs(local_directory, exist_ok=True) # Use AWS CLI to sync the directory, assume anonymous access - subprocess.check_call( - [ - "aws", - "s3", - "sync", - s3_bucket_path, - local_directory, - "--no-sign-request", - ] - ) + subprocess.check_call([ + "aws", + "s3", + "sync", + s3_bucket_path, + local_directory, + "--no-sign-request", + ]) main(args) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 1b74592d5e571..892e89422f2d4 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -46,9 +46,8 @@ def iter_llava_configs(model_name: str): ] -def vllm_to_hf_output( - vllm_output: Tuple[List[int], str], vlm_config: VisionLanguageConfig, model_id: str -): +def vllm_to_hf_output(vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, model_id: str): """Sanitize vllm output to be comparable 
with hf output. The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... @@ -61,13 +60,11 @@ def vllm_to_hf_output( image_token_str = tokenizer.decode(image_token_id) hf_input_ids = [ - input_id - for idx, input_id in enumerate(input_ids) + input_id for idx, input_id in enumerate(input_ids) if input_id != image_token_id or input_ids[idx - 1] != image_token_id ] hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "" - ) + image_token_str * vlm_config.image_feature_size, "") return hf_input_ids, hf_output_str @@ -97,32 +94,30 @@ def test_models( model_id, vlm_config = model_and_config with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: - hf_outputs = hf_model.generate_greedy( - HF_IMAGE_PROMPTS, max_tokens, images=hf_images - ) + hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images) vllm_image_prompts = [ - p.replace( - "", "" * vlm_config.image_feature_size - ) + p.replace("", + "" * vlm_config.image_feature_size) for p in HF_IMAGE_PROMPTS ] - with vllm_runner( - model_id, dtype=dtype, enforce_eager=True, **vlm_config.as_cli_args_dict() - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy( - vllm_image_prompts, max_tokens, images=vllm_images - ) + with vllm_runner(model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id - ) + vllm_outputs[i], vlm_config, model_id) assert ( hf_output_str == vllm_output_str ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert ( - hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + assert (hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 4f9ca839cac75..5d1033b85a416 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -1004,7 +1004,7 @@ def __init__( super().__init__() self.img_size = img_size - self.patch_embed = PatchEmbed( + self.patch_embed = ImagePatchEmbed( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, @@ -1384,7 +1384,7 @@ def add_decomposed_rel_pos( return attn -class PatchEmbed(nn.Module): +class ImagePatchEmbed(nn.Module): """ Image to Patch Embedding. 
""" @@ -1551,8 +1551,8 @@ def build_vision_tower(self, vision_tower_params): vision_tower = create_sam_vit(**vision_tower_params) forward_kwargs = dict() - else: # huggingface - from transformers import CLIPVisionModel + else: + from vllm.model_executor.models.clip import CLIPVisionModel vision_tower = CLIPVisionModel.from_pretrained( **vision_tower_params) @@ -1734,7 +1734,8 @@ def __init__( self.aligner = aligner_cls(aligner_config.params) language_config = config.language_config - self.language_model = LlamaModel(language_config, cache_config, quant_config) + self.language_model = LlamaModel(language_config, cache_config, + quant_config) self.image_processor = VLMImageProcessor(self.image_size) self.logits_processor = LogitsProcessor(language_config.vocab_size) self.sampler = Sampler() From de6879e11a2e3735377a905e5845b8f238bc0d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 28 Jun 2024 11:03:11 +0800 Subject: [PATCH 06/47] Remove timm dependency and Code Formatting --- vllm/model_executor/models/deepseek_vl.py | 519 +++++++++++++++++++++- 1 file changed, 498 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 5d1033b85a416..ea37399c5b8de 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -20,8 +20,10 @@ import math import warnings import copy +import collections.abc from dataclasses import dataclass +from enum import Enum from functools import partial from typing import ( Callable, @@ -36,11 +38,14 @@ Type, Union, ) +from itertools import repeat import torch import torch.nn as nn import torchvision.transforms import torch.nn.functional as F +from torch import _assert +from torch.utils.checkpoint import checkpoint import numpy as np import torchvision import torchvision.transforms.functional @@ -52,16 +57,6 @@ BatchFeature, ) from transformers.image_utils import to_numpy_array -from timm.layers import ( - AttentionPoolLatent, - DropPath, - LayerType, - Mlp, - PatchDropout, - PatchEmbed, - resample_abs_pos_embed, -) -from timm.models._manipulate import checkpoint_seq from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig @@ -84,6 +79,490 @@ IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) +LayerType = Union[str, Callable, Type[torch.nn.Module]] + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + + +class Format(str, Enum): + NCHW = "NCHW" + NHWC = "NHWC" + NCL = "NCL" + NLC = "NLC" + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py +class AttentionPoolLatent(nn.Module): + """Attention pooling w/ latent query""" + + fused_attn: torch.jit.Final[bool] + + def __init__( + self, + in_features: int, + out_features: int = None, + embed_dim: int = None, + num_heads: int = 8, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + qk_norm: bool = False, + latent_len: int = 1, + latent_dim: int = None, + pos_embed: str = "", + pool_type: str = "token", + norm_layer: Optional[nn.Module] = None, + drop: float = 0.0, + ): + super().__init__() + embed_dim = embed_dim or in_features + out_features = out_features or in_features + assert embed_dim % num_heads == 0 
+ self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.scale = self.head_dim**-0.5 + self.pool = pool_type + self.fused_attn = True + + if pos_embed == "abs": + spatial_len = self.feat_size + self.pos_embed = nn.Parameter(torch.zeros(spatial_len, + in_features)) + else: + self.pos_embed = None + + self.latent_dim = latent_dim or embed_dim + self.latent_len = latent_len + self.latent = nn.Parameter(torch.zeros(1, self.latent_len, embed_dim)) + + self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias) + self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias) + self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() + self.proj = nn.Linear(embed_dim, embed_dim) + self.proj_drop = nn.Dropout(drop) + + self.norm = (norm_layer(out_features) + if norm_layer is not None else nn.Identity()) + self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio)) + + def forward(self, x): + B, N, C = x.shape + + if self.pos_embed is not None: + # FIXME interpolate + x = x + self.pos_embed.unsqueeze(0).to(x.dtype) + + q_latent = self.latent.expand(B, -1, -1) + q = (self.q(q_latent).reshape(B, self.latent_len, self.num_heads, + self.head_dim).transpose(1, 2)) + + kv = (self.kv(x).reshape(B, N, 2, self.num_heads, + self.head_dim).permute(2, 0, 3, 1, 4)) + k, v = kv.unbind(0) + + q, k = self.q_norm(q), self.k_norm(k) + + if self.fused_attn: + x = F.scaled_dot_product_attention(q, k, v) + else: + q = q * self.scale + attn = q @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + x = attn @ v + x = x.transpose(1, 2).reshape(B, self.latent_len, C) + x = self.proj(x) + x = self.proj_drop(x) + + x = x + self.mlp(self.norm(x)) + + # optional pool if latent seq_len > 1 and pooled output is desired + if self.pool == "token": + x = x[:, 0] + elif self.pool == "avg": + x = x.mean(1) + return x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py +def drop_path(x, + drop_prob: float = 0.0, + training: bool = False, + scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. 
+ + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f"drop_prob={round(self.drop_prob,3):0.3f}" + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/mlp.py +class Mlp(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=None, + bias=True, + drop=0.0, + use_conv=False, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + linear_layer = partial(nn.Conv2d, + kernel_size=1) if use_conv else nn.Linear + + self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.norm = (norm_layer(hidden_features) + if norm_layer is not None else nn.Identity()) + self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.norm(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_dropout.py +class PatchDropout(nn.Module): + """ + https://arxiv.org/abs/2212.00794 + """ + + return_indices: torch.jit.Final[bool] + + def __init__( + self, + prob: float = 0.5, + num_prefix_tokens: int = 1, + ordered: bool = False, + return_indices: bool = False, + ): + super().__init__() + assert 0 <= prob < 1.0 + self.prob = prob + self.num_prefix_tokens = ( + num_prefix_tokens # exclude CLS token (or other prefix tokens) + ) + self.ordered = ordered + self.return_indices = return_indices + + def forward( + self, x + ) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]: + if not self.training or self.prob == 0.0: + if self.return_indices: + return x, None + return x + + if self.num_prefix_tokens: + prefix_tokens, x = ( + x[:, :self.num_prefix_tokens], + x[:, self.num_prefix_tokens:], + ) + else: + prefix_tokens = None + + B = x.shape[0] + L = x.shape[1] + num_keep = max(1, int(L * (1.0 - self.prob))) + keep_indices = torch.argsort(torch.randn(B, L, device=x.device), + dim=-1)[:, :num_keep] + if self.ordered: + # NOTE does not need to maintain patch order in typical transformer use, + # but possibly useful for debug / visualization + keep_indices = keep_indices.sort(dim=-1)[0] + x = x.gather(1, + keep_indices.unsqueeze(-1).expand((-1, -1) + x.shape[2:])) + + if prefix_tokens is not None: + x = torch.cat((prefix_tokens, x), dim=1) + + if self.return_indices: + return x, keep_indices + return 
x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_embed.py +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + output_fmt: Format + dynamic_img_pad: torch.jit.Final[bool] + + def __init__( + self, + img_size: Optional[int] = 224, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten: bool = True, + output_fmt: Optional[str] = None, + bias: bool = True, + strict_img_size: bool = True, + dynamic_img_pad: bool = False, + ): + super().__init__() + self.patch_size = to_2tuple(patch_size) + if img_size is not None: + self.img_size = to_2tuple(img_size) + self.grid_size = tuple( + [s // p for s, p in zip(self.img_size, self.patch_size)]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + else: + self.img_size = None + self.grid_size = None + self.num_patches = None + + if output_fmt is not None: + self.flatten = False + self.output_fmt = Format(output_fmt) + else: + # flatten spatial dim and transpose to channels last, kept for bwd compat + self.flatten = flatten + self.output_fmt = Format.NCHW + self.strict_img_size = strict_img_size + self.dynamic_img_pad = dynamic_img_pad + + self.proj = nn.Conv2d(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + bias=bias) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: + if as_scalar: + return max(self.patch_size) + else: + return self.patch_size + + def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]: + """Get grid (feature) size for given image size taking account of dynamic padding. + NOTE: must be torchscript compatible so using fixed tuple indexing + """ + if self.dynamic_img_pad: + return math.ceil(img_size[0] / self.patch_size[0]), math.ceil( + img_size[1] / self.patch_size[1]) + else: + return img_size[0] // self.patch_size[0], img_size[ + 1] // self.patch_size[1] + + def forward(self, x): + B, C, H, W = x.shape + if self.img_size is not None: + if self.strict_img_size: + _assert( + H == self.img_size[0], + f"Input height ({H}) doesn't match model ({self.img_size[0]}).", + ) + _assert( + W == self.img_size[1], + f"Input width ({W}) doesn't match model ({self.img_size[1]}).", + ) + elif not self.dynamic_img_pad: + _assert( + H % self.patch_size[0] == 0, + f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).", + ) + _assert( + W % self.patch_size[1] == 0, + f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).", + ) + if self.dynamic_img_pad: + pad_h = (self.patch_size[0] - + H % self.patch_size[0]) % self.patch_size[0] + pad_w = (self.patch_size[1] - + W % self.patch_size[1]) % self.patch_size[1] + x = F.pad(x, (0, pad_w, 0, pad_h)) + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # NCHW -> NLC + elif self.output_fmt != Format.NCHW: + x = nchw_to(x, self.output_fmt) + x = self.norm(x) + return x + + +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/pos_embed.py +def resample_abs_pos_embed( + posemb, + new_size: List[int], + old_size: Optional[List[int]] = None, + num_prefix_tokens: int = 1, + interpolation: str = "bicubic", + antialias: bool = True, + verbose: bool = False, +): + # sort out sizes, assume square if old size not provided + num_pos_tokens = posemb.shape[1] + num_new_tokens = new_size[0] * new_size[1] + num_prefix_tokens + if num_new_tokens == num_pos_tokens and 
new_size[0] == new_size[1]: + return posemb + + if old_size is None: + hw = int(math.sqrt(num_pos_tokens - num_prefix_tokens)) + old_size = hw, hw + + if num_prefix_tokens: + posemb_prefix, posemb = ( + posemb[:, :num_prefix_tokens], + posemb[:, num_prefix_tokens:], + ) + else: + posemb_prefix, posemb = None, posemb + + # do the interpolation + embed_dim = posemb.shape[-1] + orig_dtype = posemb.dtype + posemb = posemb.float() # interpolate needs float32 + posemb = posemb.reshape(1, old_size[0], old_size[1], + -1).permute(0, 3, 1, 2) + posemb = F.interpolate(posemb, + size=new_size, + mode=interpolation, + antialias=antialias) + posemb = posemb.permute(0, 2, 3, 1).reshape(1, -1, embed_dim) + posemb = posemb.to(orig_dtype) + + # add back extra (class, etc) prefix tokens + if posemb_prefix is not None: + posemb = torch.cat([posemb_prefix, posemb], dim=1) + + if not torch.jit.is_scripting() and verbose: + print(f"Resized position embedding: {old_size} to {new_size}.") + + return posemb + + +def checkpoint_seq(functions, + x, + every=1, + flatten=False, + skip_last=False, + preserve_rng_state=True): + r"""A helper function for checkpointing sequential models. + + Sequential models execute a list of modules/functions in order + (sequentially). Therefore, we can divide such a sequence into segments + and checkpoint each segment. All segments except run in :func:`torch.no_grad` + manner, i.e., not storing the intermediate activations. The inputs of each + checkpointed segment will be saved for re-running the segment in the backward pass. + + See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works. + + .. warning:: + Checkpointing currently only supports :func:`torch.autograd.backward` + and only if its `inputs` argument is not passed. :func:`torch.autograd.grad` + is not supported. + + .. warning: + At least one of the inputs needs to have :code:`requires_grad=True` if + grads are needed for model inputs, otherwise the checkpointed part of the + model won't have gradients. + + Args: + functions: A :class:`torch.nn.Sequential` or the list of modules or functions to run sequentially. + x: A Tensor that is input to :attr:`functions` + every: checkpoint every-n functions (default: 1) + flatten (bool): flatten nn.Sequential of nn.Sequentials + skip_last (bool): skip checkpointing the last function in the sequence if True + preserve_rng_state (bool, optional, default=True): Omit stashing and restoring + the RNG state during each checkpoint. + + Returns: + Output of running :attr:`functions` sequentially on :attr:`*inputs` + + Example: + >>> model = nn.Sequential(...) 
+ >>> input_var = checkpoint_seq(model, input_var, every=2) + """ + + def run_function(start, end, functions): + + def forward(_x): + for j in range(start, end + 1): + _x = functions[j](_x) + return _x + + return forward + + if isinstance(functions, torch.nn.Sequential): + functions = functions.children() + if flatten: + functions = chain.from_iterable(functions) + if not isinstance(functions, (tuple, list)): + functions = tuple(functions) + + num_checkpointed = len(functions) + if skip_last: + num_checkpointed -= 1 + end = -1 + for start in range(0, num_checkpointed, every): + end = min(start + every - 1, num_checkpointed - 1) + x = checkpoint( + run_function(start, end, functions), + x, + preserve_rng_state=preserve_rng_state, + ) + if skip_last: + return run_function(end + 1, len(functions) - 1, functions)(x) + return x class AttrDict: @@ -306,7 +785,7 @@ def forward( """ Args: - x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: + x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); otherwise it is the feature from the single vision encoder. @@ -417,7 +896,6 @@ def __init__( self.num_heads = num_heads self.head_dim = dim // num_heads self.scale = self.head_dim**-0.5 - # self.fused_attn = use_fused_attn() self.fused_attn = True self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) @@ -425,8 +903,8 @@ def __init__( self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) - self.proj_drop = (nn.Dropout(proj_drop) - if proj_drop > 0.0 else nn.Identity()) + self.proj_drop = nn.Dropout( + proj_drop) if proj_drop > 0.0 else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -501,8 +979,8 @@ def __init__( ) self.ls1 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path1 = (DropPath(drop_path) - if drop_path > 0.0 else nn.Identity()) + self.drop_path1 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) self.mlp = mlp_layer( @@ -513,8 +991,8 @@ def __init__( ) self.ls2 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path2 = (DropPath(drop_path) - if drop_path > 0.0 else nn.Identity()) + self.drop_path2 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -1551,7 +2029,7 @@ def build_vision_tower(self, vision_tower_params): vision_tower = create_sam_vit(**vision_tower_params) forward_kwargs = dict() - else: + else: from vllm.model_executor.models.clip import CLIPVisionModel vision_tower = CLIPVisionModel.from_pretrained( @@ -1571,8 +2049,7 @@ def feature_select(self, image_forward_outs): if self.select_feature == "patch": # if the output has cls_token image_features = image_features[:, 1:] - elif (self.select_feature == "cls_patch" - or self.select_feature == "same"): + elif self.select_feature == "cls_patch" or self.select_feature == "same": image_features = image_features else: raise ValueError( From 7cf06711bf30caba04f16873e6ddb58673cb7421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 28 Jun 2024 15:41:46 +0800 Subject: [PATCH 07/47] fix test failed --- requirements-test.txt | 3 +++ tests/models/test_deepseek_vl.py | 20 +++++++++++--------- 2 files 
changed, 14 insertions(+), 9 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index 8b68e0e939669..df14077d12fcf 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -20,3 +20,6 @@ aiohttp # quantization bitsandbytes==0.42.0 + +# Model +deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@681bffb \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 892e89422f2d4..bdf5c69528d9c 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,6 +1,7 @@ from typing import List, Tuple import pytest +import deepseek_vl.models from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig @@ -18,7 +19,7 @@ assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) -def iter_llava_configs(model_name: str): +def iter_deepseek_vl_configs(model_name: str): image_hw_to_feature_size = { (1024, 1024): 576, } @@ -42,7 +43,7 @@ def iter_llava_configs(model_name: str): model_and_vl_config = [ - *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), + *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), ] @@ -93,6 +94,14 @@ def test_models( """ model_id, vlm_config = model_and_config + with vllm_runner(model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, max_tokens, @@ -104,13 +113,6 @@ def test_models( for p in HF_IMAGE_PROMPTS ] - with vllm_runner(model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] From d2d3eeb6e129b2583136307436c011625d1cd35e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 28 Jun 2024 16:53:22 +0800 Subject: [PATCH 08/47] Modify the deepseek-vl version number --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index df14077d12fcf..e8b7d7e626748 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -22,4 +22,4 @@ aiohttp bitsandbytes==0.42.0 # Model -deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@681bffb \ No newline at end of file +deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@main \ No newline at end of file From 23311f65c97d4cf1f7d3a0402a6a6762348f0bf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 09:30:14 +0800 Subject: [PATCH 09/47] Delete failed test cases and dependencies to resolve conflicts --- requirements-test.txt | 5 +- tests/models/test_deepseek_vl.py | 125 ------------------------- vllm/model_executor/models/__init__.py | 1 - 3 files changed, 1 insertion(+), 130 deletions(-) delete mode 100644 tests/models/test_deepseek_vl.py diff --git a/requirements-test.txt b/requirements-test.txt index e8b7d7e626748..3ebfc16547e44 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -19,7 +19,4 @@ sentence-transformers # required for embedding aiohttp # quantization -bitsandbytes==0.42.0 - -# Model -deepseek_vl@git+https://github.com/deepseek-ai/DeepSeek-VL.git@main \ No newline at end of file 
+bitsandbytes==0.42.0 \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py deleted file mode 100644 index bdf5c69528d9c..0000000000000 --- a/tests/models/test_deepseek_vl.py +++ /dev/null @@ -1,125 +0,0 @@ -from typing import List, Tuple - -import pytest -import deepseek_vl.models -from transformers import AutoTokenizer - -from vllm.config import VisionLanguageConfig - -from ..conftest import IMAGE_FILES - -pytestmark = pytest.mark.vlm - -# The image token is placed before "user" on purpose so that the test can pass -HF_IMAGE_PROMPTS = [ - "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", - "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", -] - -assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES) - - -def iter_deepseek_vl_configs(model_name: str): - image_hw_to_feature_size = { - (1024, 1024): 576, - } - - for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), - ]: - yield ( - model_name, - VisionLanguageConfig( - image_input_type=input_type, - image_feature_size=f, - image_token_id=100015, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None, - ), - ) - - -model_and_vl_config = [ - *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str], - vlm_config: VisionLanguageConfig, model_id: str): - """Sanitize vllm output to be comparable with hf output. - The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, - x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... - It also reduces `output_str` from "bla" to "bla". - """ - input_ids, output_str = vllm_output - image_token_id = vlm_config.image_token_id - - tokenizer = AutoTokenizer.from_pretrained(model_id) - image_token_str = tokenizer.decode(image_token_id) - - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id - ] - hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "") - - return hf_input_ids, hf_output_str - - -# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] -@pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - hf_images, - vllm_images, - model_and_config, - dtype: str, - max_tokens: int, -) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test is under tests/images. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - model_id, vlm_config = model_and_config - - with vllm_runner(model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) - - with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: - hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, - max_tokens, - images=hf_images) - - vllm_image_prompts = [ - p.replace("", - "" * vlm_config.image_feature_size) - for p in HF_IMAGE_PROMPTS - ] - - - for i in range(len(HF_IMAGE_PROMPTS)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id) - assert ( - hf_output_str == vllm_output_str - ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert (hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 4c708478f716a..e7ced618c7be7 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -21,7 +21,6 @@ "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), - "MultiModalityCausalLM": ("deepseek_vl", "DeepSeekMultiModalityCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), From 89d785663931f6e2cb3f29ae3e7553bc62cee54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 09:33:19 +0800 Subject: [PATCH 10/47] resolve conflicts --- vllm/model_executor/models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 69a65ff023bc9..b4f01a5dc98aa 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -22,6 +22,7 @@ "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), + "MultiModalityCausalLM": ("deepseek_vl", "DeepSeekMultiModalityCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), From 1eb7d483dbfa768fbfd0424aa08068e8c307b976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 10:07:12 +0800 Subject: [PATCH 11/47] fix code bug --- vllm/model_executor/models/deepseek_vl.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index ea37399c5b8de..dded8443ff708 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -38,7 +38,7 @@ Type, Union, ) -from itertools import repeat +from itertools import (repeat, chain) import torch import torch.nn as nn @@ -103,6 +103,16 @@ class Format(str, Enum): NLC = "NLC" +def nchw_to(x: torch.Tensor, fmt: Format): + if fmt == Format.NHWC: + x = x.permute(0, 2, 3, 1) + elif fmt == Format.NLC: + x = x.flatten(2).transpose(1, 2) + elif fmt == Format.NCL: + x = x.flatten(2) + return x + + # From 
https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py class AttentionPoolLatent(nn.Module): """Attention pooling w/ latent query""" @@ -408,11 +418,11 @@ def forward(self, x): if self.img_size is not None: if self.strict_img_size: _assert( - H == self.img_size[0], + self.img_size[0] == H, f"Input height ({H}) doesn't match model ({self.img_size[0]}).", ) _assert( - W == self.img_size[1], + self.img_size[1] == W, f"Input width ({W}) doesn't match model ({self.img_size[1]}).", ) elif not self.dynamic_img_pad: @@ -490,6 +500,7 @@ def resample_abs_pos_embed( return posemb +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/_manipulate.py def checkpoint_seq(functions, x, every=1, From 0f127c663ed054f073dab2dddc79049855ba9382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 1 Jul 2024 15:49:31 +0800 Subject: [PATCH 12/47] fix error Line too long --- examples/deepseek_vl_example.py | 17 +- vllm/model_executor/models/deepseek_vl.py | 305 ++++++++---------- vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 3 +- .../transformers_utils/configs/deepseek_vl.py | 12 +- 5 files changed, 150 insertions(+), 191 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 2032fb2a9473a..715464635d599 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -5,17 +5,21 @@ import torch from PIL import Image -from vllm import LLM -from vllm.multimodal.image import ImageFeatureData, ImagePixelData -from vllm.model_executor.models.deepseek_vl import VLMImageProcessor - # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. # You can use `.buildkite/download-images.sh` to download them -from vllm import SamplingParams +from vllm import LLM, SamplingParams +from vllm.model_executor.models.deepseek_vl import VLMImageProcessor +from vllm.multimodal.image import ImageFeatureData, ImagePixelData sample_params = SamplingParams(temperature=0, max_tokens=1024) model = "deepseek-ai/deepseek-vl-7b-chat" +prompt = "You are a helpful language and vision assistant." \ + "You are able to understand the visual content that the user provides," \ + "and assist the user with a variety of tasks using natural language.\n" \ + "User: Describe the content of this image.\nAssistant:" + +prompt = prompt.replace("", "" * 576) def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): @@ -31,8 +35,6 @@ def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): enforce_eager=True, ) - prompt = f"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" - if disable_image_processor: image = get_image_features() else: @@ -62,7 +64,6 @@ def run_deepseek_vl_image_features(): max_model_len=3072, enforce_eager=True, ) - prompt = f"You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: {''*576} Describe the content of this image.\nAssistant:" image: torch.Tensor = get_image_features() diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index dded8443ff708..27fd994be94d2 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -1,68 +1,53 @@ # Copyright (c) 2023-2024 DeepSeek. # -# Permission is hereby granted, free of charge, to any person obtaining a copy of +# Permission is hereby granted,free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in # the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, +# use, copy, modify, merge, publish, distribute,sublicense,and/or sell copies of +# the Software,and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# IMPLIED,INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+import collections.abc +import copy import math import warnings -import copy -import collections.abc - from dataclasses import dataclass from enum import Enum from functools import partial -from typing import ( - Callable, - Dict, - Final, - List, - Literal, - Optional, - Sequence, - Set, - Tuple, - Type, - Union, -) -from itertools import (repeat, chain) +from itertools import chain, repeat +from typing import (Callable, Dict, Final, List, Literal, Optional, Sequence, + Set, Tuple, Type, Union) +import numpy as np import torch import torch.nn as nn -import torchvision.transforms import torch.nn.functional as F -from torch import _assert -from torch.utils.checkpoint import checkpoint -import numpy as np import torchvision +import torchvision.transforms import torchvision.transforms.functional -from transformers import PreTrainedModel from PIL import Image -from transformers import AutoImageProcessor, PretrainedConfig -from transformers.image_processing_utils import ( - BaseImageProcessor, - BatchFeature, -) +from torch import _assert +from torch.utils.checkpoint import checkpoint +from transformers import AutoImageProcessor, PretrainedConfig, PreTrainedModel +from transformers.image_processing_utils import (BaseImageProcessor, + BatchFeature) from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VisionLanguageConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, ) + QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -71,9 +56,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import get_dummy_image_data from vllm.sequence import SamplerOutput -from .vlm_base import VisionLanguageModelBase from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig +from .vlm_base import VisionLanguageModelBase + ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -113,7 +99,7 @@ def nchw_to(x: torch.Tensor, fmt: Format): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/attention_pool.py # noqa class AttentionPoolLatent(nn.Module): """Attention pooling w/ latent query""" @@ -205,20 +191,13 @@ def forward(self, x): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py -def drop_path(x, - drop_prob: float = 0.0, - training: bool = False, - scale_by_keep: bool = True): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - - This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for - changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use - 'survival rate' as the argument. 
- - """ +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py # noqa +def drop_path( + x, + drop_prob: float = 0.0, + training: bool = False, + scale_by_keep: bool = True, +): if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob @@ -230,9 +209,12 @@ def drop_path(x, return x * random_tensor -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py # noqa class DropPath(nn.Module): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + """ + Drop paths (Stochastic Depth) per sample + (when applied in main path of residual blocks). + """ def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): super(DropPath, self).__init__() @@ -246,7 +228,7 @@ def extra_repr(self): return f"drop_prob={round(self.drop_prob,3):0.3f}" -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/mlp.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/mlp.py # noqa class Mlp(nn.Module): """MLP as used in Vision Transformer, MLP-Mixer and related networks""" @@ -266,8 +248,8 @@ def __init__( hidden_features = hidden_features or in_features bias = to_2tuple(bias) drop_probs = to_2tuple(drop) - linear_layer = partial(nn.Conv2d, - kernel_size=1) if use_conv else nn.Linear + linear_layer = (partial(nn.Conv2d, kernel_size=1) + if use_conv else nn.Linear) self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0]) self.act = act_layer() @@ -287,7 +269,7 @@ def forward(self, x): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_dropout.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_dropout.py # noqa class PatchDropout(nn.Module): """ https://arxiv.org/abs/2212.00794 @@ -333,7 +315,8 @@ def forward( keep_indices = torch.argsort(torch.randn(B, L, device=x.device), dim=-1)[:, :num_keep] if self.ordered: - # NOTE does not need to maintain patch order in typical transformer use, + # NOTE does not need to maintain patch order in typical + # transformer use, # but possibly useful for debug / visualization keep_indices = keep_indices.sort(dim=-1)[0] x = x.gather(1, @@ -347,7 +330,7 @@ def forward( return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_embed.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/patch_embed.py # noqa class PatchEmbed(nn.Module): """2D Image to Patch Embedding""" @@ -383,17 +366,20 @@ def __init__( self.flatten = False self.output_fmt = Format(output_fmt) else: - # flatten spatial dim and transpose to channels last, kept for bwd compat + # flatten spatial dim and transpose to channels last, + # kept for bwd compat self.flatten = flatten self.output_fmt = Format.NCHW self.strict_img_size = strict_img_size self.dynamic_img_pad = dynamic_img_pad - self.proj = nn.Conv2d(in_chans, - embed_dim, - kernel_size=patch_size, - stride=patch_size, - bias=bias) + self.proj = nn.Conv2d( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + bias=bias, + ) self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: @@ -403,15 +389,18 @@ def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: return self.patch_size def dynamic_feat_size(self, img_size: 
Tuple[int, int]) -> Tuple[int, int]: - """Get grid (feature) size for given image size taking account of dynamic padding. + """Get grid (feature) size for given image size taking account + of dynamic padding. NOTE: must be torchscript compatible so using fixed tuple indexing """ if self.dynamic_img_pad: return math.ceil(img_size[0] / self.patch_size[0]), math.ceil( img_size[1] / self.patch_size[1]) else: - return img_size[0] // self.patch_size[0], img_size[ - 1] // self.patch_size[1] + return ( + img_size[0] // self.patch_size[0], + img_size[1] // self.patch_size[1], + ) def forward(self, x): B, C, H, W = x.shape @@ -419,20 +408,20 @@ def forward(self, x): if self.strict_img_size: _assert( self.img_size[0] == H, - f"Input height ({H}) doesn't match model ({self.img_size[0]}).", + f"Input height ({H}) doesn't match model ({self.img_size[0]}).", # noqa ) _assert( self.img_size[1] == W, - f"Input width ({W}) doesn't match model ({self.img_size[1]}).", + f"Input width ({W}) doesn't match model ({self.img_size[1]}).", # noqa ) elif not self.dynamic_img_pad: _assert( H % self.patch_size[0] == 0, - f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).", + f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).", # noqa ) _assert( W % self.patch_size[1] == 0, - f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).", + f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).", # noqa ) if self.dynamic_img_pad: pad_h = (self.patch_size[0] - @@ -449,7 +438,7 @@ def forward(self, x): return x -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/pos_embed.py +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/pos_embed.py # noqa def resample_abs_pos_embed( posemb, new_size: List[int], @@ -500,49 +489,15 @@ def resample_abs_pos_embed( return posemb -# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/_manipulate.py -def checkpoint_seq(functions, - x, - every=1, - flatten=False, - skip_last=False, - preserve_rng_state=True): - r"""A helper function for checkpointing sequential models. - - Sequential models execute a list of modules/functions in order - (sequentially). Therefore, we can divide such a sequence into segments - and checkpoint each segment. All segments except run in :func:`torch.no_grad` - manner, i.e., not storing the intermediate activations. The inputs of each - checkpointed segment will be saved for re-running the segment in the backward pass. - - See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works. - - .. warning:: - Checkpointing currently only supports :func:`torch.autograd.backward` - and only if its `inputs` argument is not passed. :func:`torch.autograd.grad` - is not supported. - - .. warning: - At least one of the inputs needs to have :code:`requires_grad=True` if - grads are needed for model inputs, otherwise the checkpointed part of the - model won't have gradients. - - Args: - functions: A :class:`torch.nn.Sequential` or the list of modules or functions to run sequentially. - x: A Tensor that is input to :attr:`functions` - every: checkpoint every-n functions (default: 1) - flatten (bool): flatten nn.Sequential of nn.Sequentials - skip_last (bool): skip checkpointing the last function in the sequence if True - preserve_rng_state (bool, optional, default=True): Omit stashing and restoring - the RNG state during each checkpoint. 
- - Returns: - Output of running :attr:`functions` sequentially on :attr:`*inputs` - - Example: - >>> model = nn.Sequential(...) - >>> input_var = checkpoint_seq(model, input_var, every=2) - """ +# From https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/_manipulate.py # noqa +def checkpoint_seq( + functions, + x, + every=1, + flatten=False, + skip_last=False, + preserve_rng_state=True, +): def run_function(start, end, functions): @@ -798,7 +753,8 @@ def forward( Args: x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, - then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); + then it comes from the hybrid vision encoder, + and x = high_res_x, low_res_x); otherwise it is the feature from the single vision encoder. Returns: @@ -818,8 +774,9 @@ def forward( def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + # Cut & paste from PyTorch official master until it's in a few official + # releases - RW Method based on + # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf def norm_cdf(x): # Computes standard normal cumulative distribution function return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 @@ -856,25 +813,6 @@ def norm_cdf(x): def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): - # type: (torch.Tensor, float, float, float, float) -> torch.Tensor - r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first - convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its original dtype. - Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn - from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` - with values outside :math:`[a, b]` redrawn until they are within - the bounds. The method used for generating the random values works - best when :math:`a \leq \text{mean} \leq b`. 
- Args: - tensor: an n-dimensional `torch.Tensor` - mean: the mean of the normal distribution - std: the standard deviation of the normal distribution - a: the minimum cutoff value - b: the maximum cutoff value - Examples: - >>> w = torch.empty(3, 5) - >>> nn.init.trunc_normal_(w) - """ - with torch.no_grad(): dtype = tensor.dtype tensor_fp32 = tensor.float() @@ -914,8 +852,8 @@ def __init__( self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout( - proj_drop) if proj_drop > 0.0 else nn.Identity() + self.proj_drop = (nn.Dropout(proj_drop) + if proj_drop > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -990,8 +928,8 @@ def __init__( ) self.ls1 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path1 = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path1 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) self.norm2 = norm_layer(dim) self.mlp = mlp_layer( @@ -1002,8 +940,8 @@ def __init__( ) self.ls2 = (LayerScale(dim, init_values=init_values) if init_values else nn.Identity()) - self.drop_path2 = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path2 = (DropPath(drop_path) + if drop_path > 0.0 else nn.Identity()) def forward(self, x: torch.Tensor) -> torch.Tensor: x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) @@ -1014,7 +952,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class VisionTransformer(nn.Module): """Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + A PyTorch impl of : `An Image is Worth 16x16 Words: + Transformers for Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 """ @@ -1060,17 +999,21 @@ def __init__( patch_size: Patch size. in_chans: Number of image input channels. num_classes: Number of classes for classification head. - global_pool: Type of global pooling for final sequence (default: 'token'). + global_pool: Type of global pooling for final sequence + (default: 'token'). embed_dim: Transformer embedding dimension. depth: Depth of transformer. num_heads: Number of attention heads. mlp_ratio: Ratio of mlp hidden dim to embedding dim. qkv_bias: Enable bias for qkv projections if True. - init_values: Layer-scale init values (layer-scale enabled if not None). + init_values: Layer-scale init values + (layer-scale enabled if not None). class_token: Use class token. - no_embed_class: Don't include position embeddings for class (or reg) tokens. + no_embed_class: Don't include position embeddings for class + (or reg) tokens. reg_tokens: Number of register tokens. - fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'. + fc_norm: Pre head norm after pool (instead of before), if None, + enabled when global_pool == 'avg'. drop_rate: Head dropout rate. pos_drop_rate: Position embedding dropout rate. attn_drop_rate: Attention dropout rate. 
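Note: the trunc_normal_ hunk above strips the long timm docstring but keeps the behaviour it described, namely that the truncated-normal init is performed in float32 and the result is cast back so bfloat16 parameters are supported. A minimal standalone sketch of that pattern, assuming only stock PyTorch (trunc_normal_bf16_safe is an illustrative name, not a function from this patch):

import torch
import torch.nn as nn


def trunc_normal_bf16_safe(tensor: torch.Tensor,
                           mean: float = 0.0,
                           std: float = 1.0,
                           a: float = -2.0,
                           b: float = 2.0) -> torch.Tensor:
    # Initialise in float32, then copy the values back into the original
    # dtype (e.g. bfloat16), mirroring the float32 round trip kept here.
    with torch.no_grad():
        tmp = tensor.float()
        nn.init.trunc_normal_(tmp, mean=mean, std=std, a=a, b=b)
        tensor.copy_(tmp.to(tensor.dtype))
    return tensor


w = torch.empty(3, 5, dtype=torch.bfloat16)
trunc_normal_bf16_safe(w, std=0.02)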
@@ -1085,8 +1028,7 @@ def __init__( assert global_pool in ("", "avg", "token", "map") assert class_token or global_pool != "token" use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm - # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6) - # act_layer = get_act_layer(act_layer) or nn.GELU + norm_layer = partial(nn.LayerNorm, eps=1e-6) act_layer = nn.GELU @@ -1200,7 +1142,7 @@ def reset_classifier(self, num_classes: int, global_pool=None) -> None: assert global_pool in ("", "avg", "token", "map") if global_pool == "map" and self.attn_pool is None: raise AssertionError( - "Cannot currently add attention pooling in reset_classifier()." + "Cannot currently add attention pooling in reset_classifier()." # noqa ) elif global_pool != "map " and self.attn_pool is not None: self.attn_pool = None # remove attention pooling @@ -1229,7 +1171,8 @@ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor: if self.no_embed_class: # deit-3, updated JAX (big vision) - # position embedding does not overlap with class token, add then concat + # position embedding does not overlap with class token, + # add then concat x = x + pos_embed if to_cat: x = torch.cat(to_cat + [x], dim=1) @@ -1274,7 +1217,8 @@ def get_intermediate_layers( """Intermediate layer accessor (NOTE: This is a WIP experiment). Inspired by DINO / DINOv2 interface """ - # take last n blocks if n is an int, if in is a sequence, select by matching indices + # take last n blocks if n is an int, if in is a sequence, + # select by matching indices outputs = self._intermediate_layers(x, n) if norm: outputs = [self.norm(out) for out in outputs] @@ -1484,10 +1428,12 @@ def __init__( norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. - use_rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - window_size (int): Window size for window attention blocks. - global_attn_indexes (list): Indexes for blocks using global attention. + use_rel_pos (bool): If True, add relative positional embeddings to + the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. window_size (int): Window size for window + attention blocks. global_attn_indexes (list): Indexes for blocks + using global attention. downsample_channels (list): Channels for downsampling layers. """ super().__init__() @@ -1605,7 +1551,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Block(nn.Module): - """Transformer blocks with support of window attention and residual propagation blocks""" + """ + Transformer blocks with support of window attention and + residual propagation blocks + """ def __init__( self, @@ -1625,14 +1574,18 @@ def __init__( dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool): If True, add a learnable bias to query, key, value. + qkv_bias (bool): If True, add a learnable bias to + query, key, value. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. - use_rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - window_size (int): Window size for window attention blocks. 
If it equals 0, then - use global attention. - input_size (tuple(int, int) or None): Input resolution for calculating the relative + use_rel_pos (bool): If True, add relative positional embeddings to + the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. + window_size (int): Window size for window attention blocks. If + it equals 0, then use global attention. input_size + (tuple(int, int) or None): Input resolution for calculating + the relative positional parameter size. """ super().__init__() @@ -1689,10 +1642,14 @@ def __init__( Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. - qkv_bias (bool): If True, add a learnable bias to query, key, value. - rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - input_size (tuple(int, int) or None): Input resolution for calculating the relative + qkv_bias (bool): If True, add a learnable bias to + query, key, value. + rel_pos (bool): If True, add relative positional embeddings + to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. + input_size (tuple(int, int) or None): Input resolution for + calculating the relative positional parameter size. """ super().__init__() @@ -1707,7 +1664,7 @@ def __init__( if self.use_rel_pos: assert ( input_size is not None - ), "Input size must be provided if using relative positional encoding." + ), "Input size must be provided if using relative positional encoding." # noqa # initialize relative positional embeddings self.rel_pos_h = nn.Parameter( torch.zeros(2 * input_size[0] - 1, head_dim)) @@ -1752,7 +1709,8 @@ def window_partition(x: torch.Tensor, window_size (int): window size. Returns: - windows: windows after partition with [B * num_windows, window_size, window_size, C]. + windows: windows after partition with [B * num_windows, window_size, + window_size, C]. (Hp, Wp): padded height and width before partition """ B, H, W, C = x.shape @@ -1779,7 +1737,8 @@ def window_unpartition( """ Window unpartition into original sequences and removing padding. Args: - windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + windows (tensor): input tokens with + [B * num_windows, window_size, window_size, C]. window_size (int): window size. pad_hw (Tuple): padded height and width (Hp, Wp). hw (Tuple): original height and width (H, W) before padding. @@ -1845,11 +1804,10 @@ def add_decomposed_rel_pos( ) -> torch.Tensor: """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. - https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 Args: attn (Tensor): attention map. q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). - rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. q_size (Tuple): spatial sequence size of query q with (q_h, q_w). k_size (Tuple): spatial sequence size of key k with (k_h, k_w). 
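Note: the window-attention hunks above are easier to follow with the shapes written out. window_partition pads the [B, H, W, C] feature map so height and width are multiples of the window size and reshapes it into non-overlapping windows; window_unpartition later reverses the reshape and crops the padding. A rough self-contained sketch of the partition step under those assumptions (window_partition_demo is illustrative, not the patched function):

import torch
import torch.nn.functional as F


def window_partition_demo(x: torch.Tensor, window_size: int):
    # x: [B, H, W, C] -> windows: [B * num_windows, ws, ws, C], plus padded size.
    B, H, W, C = x.shape
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h or pad_w:
        # F.pad pads the last dims first: (C_l, C_r, W_l, W_r, H_l, H_r).
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w
    x = x.view(B, Hp // window_size, window_size,
               Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size,
                                                  window_size, C)
    return windows, (Hp, Wp)


x = torch.randn(2, 14, 14, 32)
windows, (Hp, Wp) = window_partition_demo(x, window_size=8)
print(windows.shape, (Hp, Wp))  # torch.Size([8, 8, 8, 32]) (16, 16)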
@@ -2060,7 +2018,8 @@ def feature_select(self, image_forward_outs): if self.select_feature == "patch": # if the output has cls_token image_features = image_features[:, 1:] - elif self.select_feature == "cls_patch" or self.select_feature == "same": + elif (self.select_feature == "cls_patch" + or self.select_feature == "same"): image_features = image_features else: raise ValueError( @@ -2164,7 +2123,7 @@ def forward(self, images: torch.Tensor): else: raise ValueError( - "Currently only support `feature`, `sequence`, `add` and `tuple` concat type." + "Currently only support `feature`, `sequence`, `add` and `tuple` concat type." # noqa ) return images_features diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 98df9875f19e7..60d5a8a20a36c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,9 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, + DeepSeekMultiModalityConfig, JAISConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig, - DeepSeekMultiModalityConfig) + MPTConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f79de04a5ad06..7de695a7b6022 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,5 +1,7 @@ from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.dbrx import DbrxConfig +from vllm.transformers_utils.configs.deepseek_vl import ( + DeepSeekMultiModalityConfig) # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. @@ -7,7 +9,6 @@ from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig -from vllm.transformers_utils.configs.deepseek_vl import DeepSeekMultiModalityConfig __all__ = [ "ChatGLMConfig", diff --git a/vllm/transformers_utils/configs/deepseek_vl.py b/vllm/transformers_utils/configs/deepseek_vl.py index b14d8cbdf5b3c..5a17a8c13b840 100644 --- a/vllm/transformers_utils/configs/deepseek_vl.py +++ b/vllm/transformers_utils/configs/deepseek_vl.py @@ -1,25 +1,23 @@ # Copyright (c) 2023-2024 DeepSeek. # -# Permission is hereby granted, free of charge, to any person obtaining a copy of +# Permission is hereby granted,free of charge, to any person obtaining a copy of # this software and associated documentation files (the "Software"), to deal in # the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -# the Software, and to permit persons to whom the Software is furnished to do so, +# use, copy, modify, merge, publish, distribute,sublicense,and/or sell copies of +# the Software,and to permit persons to whom the Software is furnished to do so, # subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# IMPLIED,INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,FITNESS # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from transformers import AutoConfig -from transformers import PretrainedConfig -from transformers import LlamaConfig +from transformers import AutoConfig, LlamaConfig, PretrainedConfig class VisionConfig(PretrainedConfig): From 78612619a56e3f1e3d18aa00ba8b3838990c3c6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 2 Jul 2024 15:46:49 +0800 Subject: [PATCH 13/47] add test case and fix update --- tests/models/test_deepseek_vl.py | 268 ++++++++++++++++++++++ vllm/model_executor/models/deepseek_vl.py | 56 ++++- 2 files changed, 318 insertions(+), 6 deletions(-) create mode 100644 tests/models/test_deepseek_vl.py diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py new file mode 100644 index 0000000000000..eb5d40d26b304 --- /dev/null +++ b/tests/models/test_deepseek_vl.py @@ -0,0 +1,268 @@ +from typing import List, Tuple + +import pytest + +import torch +from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM + +from transformers import LlamaForCausalLM +from transformers import AutoTokenizer + +from vllm.config import VisionLanguageConfig + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets + +from vllm.model_executor.models.deepseek_vl import ( + model_name_to_cls, + MultiModalityPreTrainedModel, + VLMImageProcessor, +) +from vllm.transformers_utils.config import DeepSeekMultiModalityConfig + + + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = [ + "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", + "You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", +] + + + +class MultiModalityCausalLM(MultiModalityPreTrainedModel): + def __init__(self, config: DeepSeekMultiModalityConfig): + super().__init__(config) + + vision_config = config.vision_config + vision_cls = model_name_to_cls(vision_config.cls) + self.vision_model = vision_cls(**vision_config.params) + + aligner_config = config.aligner_config + aligner_cls = model_name_to_cls(aligner_config.cls) + self.aligner = aligner_cls(aligner_config.params) + + language_config = config.language_config + self.language_model = LlamaForCausalLM(language_config) + + def prepare_inputs_embeds( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + images_seq_mask: torch.LongTensor, + **kwargs, + ): + """ + + Args: + input_ids (torch.LongTensor): [b, T] + pixel_values (torch.FloatTensor): [b, n_images, 3, h, w] + images_seq_mask (torch.BoolTensor): [b, T] + + assert torch.sum(images_seq_mask) == torch.sum(images_emb_mask) + + Returns: + input_embeds (torch.Tensor): [b, T, D] + """ + + bs, n = pixel_values.shape[0:2] + p_b, p_n, p_c, p_h, p_w = pixel_values.shape + images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) + images_embeds = self.aligner(self.vision_model(images)) + + # [b x n, T2, D] -> [b, n x T2, D] + _, t, d = images_embeds.shape + images_embeds = images_embeds.reshape(bs, n * t, d) + + # [b, T, D] + input_ids[input_ids < 0] = 0 # ignore the image embeddings + inputs_embeds = self.language_model.get_input_embeddings()( + input_ids + ).reshape(1, -1, 4096) + + # replace with the image embeddings + images_embeds = images_embeds.reshape( + 1, -1, self.config.aligner_config.params["n_embed"] + ) + inputs_embeds[images_seq_mask] = images_embeds + + return inputs_embeds + + +def get_input(tokenizer, prompt, image): + + image_id = 100015 + vl_image = VLMImageProcessor(1024) + input_ids = tokenizer.encode(prompt) + input_ids = torch.LongTensor(input_ids) + image_token_mask = input_ids == image_id + images_outputs = vl_image(image, return_tensors="pt") + images_emb_mask = torch.ones(1, 1, 576) == 1 + prepare = { + "sft_format": prompt, + "input_ids": input_ids.to("cuda"), + "pixel_values": images_outputs.pixel_values.to(torch.bfloat16) + .to("cuda") + .reshape(1, -1, 3, 1024, 1024), + "num_image_tokens": 576, + "images_seq_mask": image_token_mask.to("cuda").reshape(1, -1), + "images_emb_mask": images_emb_mask.to("cuda"), + "attention_mask": torch.ones(1, len(input_ids)).to("cuda"), + } + return prepare + + +def iter_llava_configs(model_name: str): + image_hw_to_feature_size = { + (1024, 1024): 576, + } + + for (h, w), f in image_hw_to_feature_size.items(): + for input_type, input_shape in [ + (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), + (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), + ]: + yield ( + model_name, + VisionLanguageConfig( + image_input_type=input_type, + image_feature_size=f, + image_token_id=100015, + image_input_shape=input_shape, + image_processor=model_name, + image_processor_revision=None, + ), + ) + + +model_and_vl_config = [ + *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), +] + + +def vllm_to_hf_output( + vllm_output: Tuple[List[int], str], + vlm_config: VisionLanguageConfig, + model_id: str, +): + """Sanitize vllm output to be comparable with hf output. 
+ The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, + x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... + It also reduces `output_str` from "bla" to "bla". + """ + input_ids, output_str = vllm_output + image_token_id = vlm_config.image_token_id + + tokenizer = AutoTokenizer.from_pretrained(model_id) + image_token_str = tokenizer.decode(image_token_id) + + hf_input_ids = [ + input_id + for idx, input_id in enumerate(input_ids) + if input_id != image_token_id or input_ids[idx - 1] != image_token_id + ] + hf_output_str = output_str.replace( + image_token_str * vlm_config.image_feature_size, "" + ) + + return hf_input_ids, hf_output_str + + +# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models( + hf_runner, + vllm_runner, + hf_images, + vllm_images, + model_and_config, + dtype: str, + max_tokens: int, +) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + model_id, vlm_config = model_and_config + + vllm_image_prompts = [ + p.replace( + "", + "" * vlm_config.image_feature_size, + ) + for p in HF_IMAGE_PROMPTS + ] + + with vllm_runner( + model_id, + dtype=dtype, + enforce_eager=True, + **vlm_config.as_cli_args_dict(), + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + vllm_image_prompts, max_tokens, images=vllm_images + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + AutoModelForCausalLM.register( + DeepSeekMultiModalityConfig, MultiModalityCausalLM + ) + with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: + prepare_input_one = get_input( + tokenizer, + HF_IMAGE_PROMPTS[0].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_two = get_input( + tokenizer, + HF_IMAGE_PROMPTS[1].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) + prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) + prepare_input = torch.concat(prepare_input_one, prepare_input_two) + attention_mask = torch.concat( + prepare_input_one["attention_mask"], + prepare_input_two["attention_mask"], + ) + hf_outputs = hf_model.generate_greedy( + HF_IMAGE_PROMPTS, + max_tokens, + images=hf_images, + inputs_embeds=prepare_input, + attention_mask=attention_mask, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + do_sample=False, + use_cache=True, + ) + + + + + + for i in range(len(HF_IMAGE_PROMPTS)): + hf_output_ids, hf_output_str = hf_outputs[i] + vllm_output_ids, vllm_output_str = vllm_to_hf_output( + vllm_outputs[i], vlm_config, model_id + ) + assert ( + hf_output_str == vllm_output_str + ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" + assert ( + hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 27fd994be94d2..16e6a10c95530 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ 
b/vllm/model_executor/models/deepseek_vl.py @@ -44,7 +44,7 @@ from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -53,13 +53,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import get_dummy_image_data -from vllm.sequence import SamplerOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData +from vllm.multimodal.base import VisionLanguageModelBase +from vllm.multimodal.image import ImageFeatureData, ImagePixelData +from vllm.sequence import SamplerOutput, SequenceData from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig -from .vlm_base import VisionLanguageModelBase - ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -68,6 +67,51 @@ LayerType = Union[str, Callable, Type[torch.nn.Module]] +def _get_dummy_seq_data(seq_len: int, + vlm_config: VisionLanguageConfig) -> SequenceData: + # NOTE: We assume that token is repeated `image_feature_size` times + # and then concatenated with the text prompt + # TODO: Enable other ways of inserting the image into the prompt + + token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size + token_ids += [0] * (seq_len - vlm_config.image_feature_size) + + return SequenceData(token_ids) + + +def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor: + if vlm_config.image_processor is None: + values_dtype = torch.float16 + else: + values_dtype = torch.uint8 + + return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype) + + +def get_dummy_image_data( + seq_len: int, + model_config: ModelConfig, + vlm_config: VisionLanguageConfig, +) -> Tuple[SequenceData, MultiModalData]: + """Standard dummy data factory for image data (to be used in + :meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`).""" + seq_data = _get_dummy_seq_data(seq_len, vlm_config) + values = _get_dummy_values(vlm_config) + + config_input_type = vlm_config.image_input_type + ImageInputType = VisionLanguageConfig.ImageInputType + + fake_mm_data: MultiModalData + if config_input_type == ImageInputType.PIXEL_VALUES: + fake_mm_data = ImagePixelData(values) + elif config_input_type == ImageInputType.IMAGE_FEATURES: + fake_mm_data = ImageFeatureData(values) + else: + raise NotImplementedError + + return seq_data, fake_mm_data + + # From PyTorch internals def _ntuple(n): From 9b2e11677383c834b57d679089b1523d982e853c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 2 Jul 2024 16:08:47 +0800 Subject: [PATCH 14/47] uodate test case --- tests/models/test_deepseek_vl.py | 137 ++++++++++++++++++------------- 1 file changed, 81 insertions(+), 56 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index eb5d40d26b304..c585ede7d8b1f 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing 
import List, Optional, Tuple, Type import pytest @@ -21,7 +21,6 @@ from vllm.transformers_utils.config import DeepSeekMultiModalityConfig - pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass @@ -31,7 +30,6 @@ ] - class MultiModalityCausalLM(MultiModalityPreTrainedModel): def __init__(self, config: DeepSeekMultiModalityConfig): super().__init__(config) @@ -114,7 +112,7 @@ def get_input(tokenizer, prompt, image): return prepare -def iter_llava_configs(model_name: str): +def iter_deepseek_vl_configs(model_name: str): image_hw_to_feature_size = { (1024, 1024): 576, } @@ -138,7 +136,7 @@ def iter_llava_configs(model_name: str): model_and_vl_config = [ - *iter_llava_configs("deepseek-ai/deepseek-vl-7b-chat"), + *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), ] @@ -174,14 +172,16 @@ def vllm_to_hf_output( @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - hf_images, - vllm_images, - model_and_config, +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model_and_config: Tuple[str, VisionLanguageConfig], + *, dtype: str, max_tokens: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ) -> None: """Inference result should be the same between hf and vllm. @@ -193,6 +193,7 @@ def test_models( The text output is sanitized to be able to compare with hf. """ model_id, vlm_config = model_and_config + hf_images = [asset.for_hf() for asset in image_assets] vllm_image_prompts = [ p.replace( @@ -202,59 +203,61 @@ def test_models( for p in HF_IMAGE_PROMPTS ] - with vllm_runner( - model_id, - dtype=dtype, - enforce_eager=True, - **vlm_config.as_cli_args_dict(), - ) as vllm_model: + with vllm_runner(model_id, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vlm_config.as_cli_args_dict()) as vllm_model: + vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] vllm_outputs = vllm_model.generate_greedy( vllm_image_prompts, max_tokens, images=vllm_images ) - - tokenizer = AutoTokenizer.from_pretrained(model_id) AutoModelForCausalLM.register( DeepSeekMultiModalityConfig, MultiModalityCausalLM ) - with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: - prepare_input_one = get_input( - tokenizer, - HF_IMAGE_PROMPTS[0].replace( - "", "" * 576 - ), - hf_images, - ) - prepare_input_two = get_input( - tokenizer, - HF_IMAGE_PROMPTS[1].replace( - "", "" * 576 - ), - hf_images, - ) - prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) - prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) - prepare_input = torch.concat(prepare_input_one, prepare_input_two) - attention_mask = torch.concat( - prepare_input_one["attention_mask"], - prepare_input_two["attention_mask"], - ) - hf_outputs = hf_model.generate_greedy( - HF_IMAGE_PROMPTS, - max_tokens, - images=hf_images, - inputs_embeds=prepare_input, - attention_mask=attention_mask, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - do_sample=False, - use_cache=True, + tokenizer = AutoTokenizer.from_pretrained(model_id) + hf_model = AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True + ) + hf_model = 
hf_model.to("cuda").eval() + prepare_input_one = get_input( + tokenizer, + HF_IMAGE_PROMPTS[0].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_two = get_input( + tokenizer, + HF_IMAGE_PROMPTS[1].replace( + "", "" * 576 + ), + hf_images, + ) + prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) + prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) + prepare_input = torch.concat(prepare_input_one, prepare_input_two) + attention_mask = torch.concat( + prepare_input_one["attention_mask"], + prepare_input_two["attention_mask"], + ) + outputs = hf_model.generate( + inputs_embeds=prepare_input, + attention_mask=attention_mask, + max_new_tokens=max_tokens, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + do_sample=False, + use_cache=True, + ) + hf_outputs = [] + for o in outputs: + hf_outputs.append( + o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True) ) - - - - for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_to_hf_output( @@ -266,3 +269,25 @@ def test_models( assert ( hf_output_ids == vllm_output_ids ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + + +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models( + hf_runner, + vllm_runner, + image_assets, + model_and_config, + dtype: str, + max_tokens: int, +) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model_and_config, + dtype=dtype, + max_tokens=max_tokens, + tensor_parallel_size=1, + ) From 80533ea6b16e6114b6be44aae6cef1cf8084af5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 2 Jul 2024 16:17:35 +0800 Subject: [PATCH 15/47] uodate test case and fix test bugs --- tests/models/test_deepseek_vl.py | 122 +++++++++++++++---------------- 1 file changed, 58 insertions(+), 64 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index c585ede7d8b1f..953aa365a71af 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,36 +1,33 @@ from typing import List, Optional, Tuple, Type import pytest - import torch -from transformers import AutoTokenizer -from transformers import AutoModelForCausalLM - -from transformers import LlamaForCausalLM -from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM from vllm.config import VisionLanguageConfig - -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets - from vllm.model_executor.models.deepseek_vl import ( - model_name_to_cls, - MultiModalityPreTrainedModel, - VLMImageProcessor, -) + MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.transformers_utils.config import DeepSeekMultiModalityConfig +from ..conftest import HfRunner, VllmRunner, _ImageAssets pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = [ - "You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What's the content of the image?\nAssistant:", - "You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n User: What is the season?\nAssistant:", + "You are a helpful language and vision assistant. You are able" \ + "to understand the visual content that the user provides, and assist " \ + "the user with a variety of tasks using natural language.\n User:" \ + " What's the content of the image?\nAssistant:", + "You are a helpful language and vision assistant. You are able to "\ + "understand the visual content that the user provides, and assist the "\ + "user with a variety of tasks using natural language.\n User: "\ + "What is the season?\nAssistant:", ] class MultiModalityCausalLM(MultiModalityPreTrainedModel): + def __init__(self, config: DeepSeekMultiModalityConfig): super().__init__(config) @@ -77,13 +74,11 @@ def prepare_inputs_embeds( # [b, T, D] input_ids[input_ids < 0] = 0 # ignore the image embeddings inputs_embeds = self.language_model.get_input_embeddings()( - input_ids - ).reshape(1, -1, 4096) + input_ids).reshape(1, -1, 4096) # replace with the image embeddings images_embeds = images_embeds.reshape( - 1, -1, self.config.aligner_config.params["n_embed"] - ) + 1, -1, self.config.aligner_config.params["n_embed"]) inputs_embeds[images_seq_mask] = images_embeds return inputs_embeds @@ -99,15 +94,21 @@ def get_input(tokenizer, prompt, image): images_outputs = vl_image(image, return_tensors="pt") images_emb_mask = torch.ones(1, 1, 576) == 1 prepare = { - "sft_format": prompt, - "input_ids": input_ids.to("cuda"), - "pixel_values": images_outputs.pixel_values.to(torch.bfloat16) - .to("cuda") - .reshape(1, -1, 3, 1024, 1024), - "num_image_tokens": 576, - "images_seq_mask": image_token_mask.to("cuda").reshape(1, -1), - "images_emb_mask": images_emb_mask.to("cuda"), - "attention_mask": torch.ones(1, len(input_ids)).to("cuda"), + "sft_format": + prompt, + "input_ids": + input_ids.to("cuda"), + "pixel_values": + images_outputs.pixel_values.to(torch.bfloat16).to("cuda").reshape( + 1, -1, 3, 1024, 1024), + "num_image_tokens": + 576, + "images_seq_mask": + image_token_mask.to("cuda").reshape(1, -1), + "images_emb_mask": + images_emb_mask.to("cuda"), + "attention_mask": + torch.ones(1, len(input_ids)).to("cuda"), } return prepare @@ -148,7 +149,8 @@ def vllm_to_hf_output( """Sanitize vllm output to be comparable with hf output. The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... - It also reduces `output_str` from "bla" to "bla". + It also reduces `output_str` from + "bla" to "bla". 
""" input_ids, output_str = vllm_output image_token_id = vlm_config.image_token_id @@ -157,13 +159,11 @@ def vllm_to_hf_output( image_token_str = tokenizer.decode(image_token_id) hf_input_ids = [ - input_id - for idx, input_id in enumerate(input_ids) + input_id for idx, input_id in enumerate(input_ids) if input_id != image_token_id or input_ids[idx - 1] != image_token_id ] hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "" - ) + image_token_str * vlm_config.image_feature_size, "") return hf_input_ids, hf_output_str @@ -199,40 +199,37 @@ def run_test( p.replace( "", "" * vlm_config.image_feature_size, - ) - for p in HF_IMAGE_PROMPTS + ) for p in HF_IMAGE_PROMPTS ] - with vllm_runner(model_id, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - **vlm_config.as_cli_args_dict()) as vllm_model: + with vllm_runner( + model_id, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + **vlm_config.as_cli_args_dict(), + ) as vllm_model: vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] - vllm_outputs = vllm_model.generate_greedy( - vllm_image_prompts, max_tokens, images=vllm_images - ) - AutoModelForCausalLM.register( - DeepSeekMultiModalityConfig, MultiModalityCausalLM - ) + vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, + max_tokens, + images=vllm_images) + AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, + MultiModalityCausalLM) tokenizer = AutoTokenizer.from_pretrained(model_id) - hf_model = AutoModelForCausalLM.from_pretrained( - model_id, trust_remote_code=True - ) + hf_model = AutoModelForCausalLM.from_pretrained(model_id, + trust_remote_code=True) hf_model = hf_model.to("cuda").eval() prepare_input_one = get_input( tokenizer, - HF_IMAGE_PROMPTS[0].replace( - "", "" * 576 - ), + HF_IMAGE_PROMPTS[0].replace("", + "" * 576), hf_images, ) prepare_input_two = get_input( tokenizer, - HF_IMAGE_PROMPTS[1].replace( - "", "" * 576 - ), + HF_IMAGE_PROMPTS[1].replace("", + "" * 576), hf_images, ) prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) @@ -252,23 +249,20 @@ def run_test( do_sample=False, use_cache=True, ) - hf_outputs = [] + hf_outputs: List = [] for o in outputs: hf_outputs.append( - o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True) - ) + (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) for i in range(len(HF_IMAGE_PROMPTS)): hf_output_ids, hf_output_str = hf_outputs[i] vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id - ) + vllm_outputs[i], vlm_config, model_id) assert ( hf_output_str == vllm_output_str ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert ( - hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + assert (hf_output_ids == vllm_output_ids + ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" @pytest.mark.parametrize("model_and_config", model_and_vl_config) From 92c80841e7acfef35728af1cf0b03fb6087a93a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 10:20:20 +0800 Subject: [PATCH 16/47] update test case and fix bugs --- tests/models/test_deepseek_vl.py | 8 +-- vllm/model_executor/models/deepseek_vl.py | 81 ++++++++--------------- 2 files changed, 29 insertions(+), 60 deletions(-) diff --git 
a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 953aa365a71af..1c007f5772e1b 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -210,7 +210,7 @@ def run_test( enforce_eager=True, **vlm_config.as_cli_args_dict(), ) as vllm_model: - vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] + vllm_images = [asset.for_vllm() for asset in image_assets] vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, max_tokens, images=vllm_images) @@ -222,14 +222,12 @@ def run_test( hf_model = hf_model.to("cuda").eval() prepare_input_one = get_input( tokenizer, - HF_IMAGE_PROMPTS[0].replace("", - "" * 576), + vllm_image_prompts[0], hf_images, ) prepare_input_two = get_input( tokenizer, - HF_IMAGE_PROMPTS[1].replace("", - "" * 576), + vllm_image_prompts[1], hf_images, ) prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 16e6a10c95530..867202ca088af 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -44,7 +44,8 @@ from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig +from vllm.config import CacheConfig, VisionLanguageConfig +from vllm.inputs import INPUT_REGISTRY, InputContext from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -53,12 +54,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData -from vllm.multimodal.base import VisionLanguageModelBase -from vllm.multimodal.image import ImageFeatureData, ImagePixelData -from vllm.sequence import SamplerOutput, SequenceData +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig +from .clip import dummy_seq_data_for_clip +from .interfaces import SupportsVision + ImageType = Union[np.ndarray, torch.Tensor, Image.Image] IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -67,51 +69,6 @@ LayerType = Union[str, Callable, Type[torch.nn.Module]] -def _get_dummy_seq_data(seq_len: int, - vlm_config: VisionLanguageConfig) -> SequenceData: - # NOTE: We assume that token is repeated `image_feature_size` times - # and then concatenated with the text prompt - # TODO: Enable other ways of inserting the image into the prompt - - token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size - token_ids += [0] * (seq_len - vlm_config.image_feature_size) - - return SequenceData(token_ids) - - -def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor: - if vlm_config.image_processor is None: - values_dtype = torch.float16 - else: - values_dtype = torch.uint8 - - return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype) - - -def get_dummy_image_data( - seq_len: int, - model_config: ModelConfig, - vlm_config: VisionLanguageConfig, -) -> Tuple[SequenceData, MultiModalData]: - """Standard dummy data factory for image data (to be used in - 
:meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`).""" - seq_data = _get_dummy_seq_data(seq_len, vlm_config) - values = _get_dummy_values(vlm_config) - - config_input_type = vlm_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType - - fake_mm_data: MultiModalData - if config_input_type == ImageInputType.PIXEL_VALUES: - fake_mm_data = ImagePixelData(values) - elif config_input_type == ImageInputType.IMAGE_FEATURES: - fake_mm_data = ImageFeatureData(values) - else: - raise NotImplementedError - - return seq_data, fake_mm_data - - # From PyTorch internals def _ntuple(n): @@ -2196,10 +2153,24 @@ class MultiModalityPreTrainedModel(PreTrainedModel): _skip_keys_device_placement = "past_key_values" -@MULTIMODAL_REGISTRY.register_image_feature_input() -@MULTIMODAL_REGISTRY.register_image_pixel_input() -@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) -class DeepSeekMultiModalityCausalLM(VisionLanguageModelBase): +def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(DeepSeekMultiModalityConfig) + vision_config = hf_config.vision_config + image_size = vision_config.params.get("image_size") + if not image_size: + # Get image size for 7b model + image_size = vision_config.params["high_res_cfg"]["image_size"] + seq_data = dummy_seq_data_for_clip(vision_config, + seq_len, + image_token_id=100015, + image_feature_size_override=576) + mm_data = Image.new("RGB", (image_size, image_size), color=0) + return seq_data, mm_data + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) +class DeepSeekMultiModalityCausalLM(nn.Module, SupportsVision): def __init__( self, @@ -2208,7 +2179,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): - super().__init__(config, ) + super().__init__() self.config = config vision_config = config.vision_config aligner_config = config.aligner_config From d6452bb7967d55fbfa1a35fc5d8f93f4a220f0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 10:28:10 +0800 Subject: [PATCH 17/47] update test example --- examples/deepseek_vl_example.py | 68 ++++----------------------------- 1 file changed, 8 insertions(+), 60 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 715464635d599..43d7fe55b6a01 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -1,18 +1,13 @@ -import argparse import os import subprocess -import torch from PIL import Image # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. # You can use `.buildkite/download-images.sh` to download them from vllm import LLM, SamplingParams -from vllm.model_executor.models.deepseek_vl import VLMImageProcessor -from vllm.multimodal.image import ImageFeatureData, ImagePixelData sample_params = SamplingParams(temperature=0, max_tokens=1024) - model = "deepseek-ai/deepseek-vl-7b-chat" prompt = "You are a helpful language and vision assistant." 
\ "You are able to understand the visual content that the user provides," \ @@ -22,7 +17,7 @@ prompt = prompt.replace("", "" * 576) -def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): +def run_deepseek_vl(*, disable_image_processor: bool = False): llm = LLM( model=model, image_input_type="pixel_values", @@ -35,15 +30,14 @@ def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): enforce_eager=True, ) - if disable_image_processor: - image = get_image_features() - else: - image = Image.open("images/stop_sign.jpg") + image = Image.open("images/stop_sign.jpg") outputs = llm.generate( { "prompt": prompt, - "multi_modal_data": ImagePixelData(image), + "multi_modal_data": { + "image": image + }, }, sampling_params=sample_params, ) @@ -53,57 +47,11 @@ def run_deepseek_vl_pixel_values(*, disable_image_processor: bool = False): print(generated_text) -def run_deepseek_vl_image_features(): - llm = LLM( - model=model, - image_input_type="image_features", - image_token_id=100015, - image_input_shape="1,3,1024,1024", - image_feature_size=576, - gpu_memory_utilization=0.9, - max_model_len=3072, - enforce_eager=True, - ) - - image: torch.Tensor = get_image_features() - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": ImageFeatureData(image), - }, - sampling_params=sample_params, - ) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -def get_image_features(): - image_feature = VLMImageProcessor(1024)( - Image.open("images/stop_sign.jpg"))["pixel_values"] - torch.save(image_feature, "images/deepseek_vl_stop_sign.pt") - return torch.load("images/deepseek_vl_stop_sign.pt") - - -def main(args): - if args.type == "pixel_values": - run_deepseek_vl_pixel_values() - else: - run_deepseek_vl_image_features() +def main(): + run_deepseek_vl() if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Demo on deepseek-vl") - parser.add_argument( - "--type", - type=str, - choices=["pixel_values", "image_features"], - default="pixel_values", - help="image input type", - ) - args = parser.parse_args() # Download from s3 s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" local_directory = "images" @@ -120,4 +68,4 @@ def main(args): local_directory, "--no-sign-request", ]) - main(args) + main() From f63a1a1a0e205a3bed5d3aab91c620f83e8b03b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 11:36:13 +0800 Subject: [PATCH 18/47] fix test case --- tests/models/test_deepseek_vl.py | 41 +++++++++++++------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 1c007f5772e1b..f8b75d5eb5825 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -10,6 +10,7 @@ from vllm.transformers_utils.config import DeepSeekMultiModalityConfig from ..conftest import HfRunner, VllmRunner, _ImageAssets +from .utils import check_outputs_equal pytestmark = pytest.mark.vlm @@ -115,25 +116,15 @@ def get_input(tokenizer, prompt, image): def iter_deepseek_vl_configs(model_name: str): image_hw_to_feature_size = { - (1024, 1024): 576, + (336, 336): 576, } for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), - ]: - yield ( - model_name, - VisionLanguageConfig( - 
image_input_type=input_type, - image_feature_size=f, - image_token_id=100015, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None, - ), - ) + input_shape = (1, 3, h, w) + yield (model_name, + VisionLanguageConfig(image_feature_size=f, + image_token_id=100015, + image_input_shape=input_shape)) model_and_vl_config = [ @@ -252,15 +243,15 @@ def run_test( hf_outputs.append( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - for i in range(len(HF_IMAGE_PROMPTS)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_to_hf_output( - vllm_outputs[i], vlm_config, model_id) - assert ( - hf_output_str == vllm_output_str - ), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}" - assert (hf_output_ids == vllm_output_ids - ), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}" + check_outputs_equal( + hf_outputs, + [ + vllm_to_hf_output(vllm_output, vlm_config, model_id) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model_and_config", model_and_vl_config) From 1b90f47da0b83ebb7cc381b72fbd948645ae0ed4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Wed, 3 Jul 2024 18:07:39 +0800 Subject: [PATCH 19/47] Adaptation code update --- examples/deepseek_vl_example.py | 14 +++++++------- vllm/model_executor/models/deepseek_vl.py | 17 ++++++++--------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 43d7fe55b6a01..b1d18ceb2af65 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -8,23 +8,23 @@ from vllm import LLM, SamplingParams sample_params = SamplingParams(temperature=0, max_tokens=1024) -model = "deepseek-ai/deepseek-vl-7b-chat" -prompt = "You are a helpful language and vision assistant." \ - "You are able to understand the visual content that the user provides," \ - "and assist the user with a variety of tasks using natural language.\n" \ +model = "/pretrained_models/deepseek-vl-7b-chat" +prompt = ( + "You are a helpful language and vision assistant." 
+ "You are able to understand the visual content that the user provides," + "and assist the user with a variety of tasks using natural language.\n" "User: Describe the content of this image.\nAssistant:" +) prompt = prompt.replace("", "" * 576) -def run_deepseek_vl(*, disable_image_processor: bool = False): +def run_deepseek_vl(): llm = LLM( model=model, - image_input_type="pixel_values", image_token_id=100015, image_input_shape="1,3,1024,1024", image_feature_size=576, - disable_image_processor=False, gpu_memory_utilization=0.9, max_model_len=3072, enforce_eager=True, diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 867202ca088af..3e10f2c0ee4db 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -2164,7 +2164,7 @@ def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): seq_len, image_token_id=100015, image_feature_size_override=576) - mm_data = Image.new("RGB", (image_size, image_size), color=0) + mm_data = {"image": Image.new("RGB", (image_size, image_size), color=0)} return seq_data, mm_data @@ -2172,13 +2172,11 @@ def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): @INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) class DeepSeekMultiModalityCausalLM(nn.Module, SupportsVision): - def __init__( - self, - config: DeepSeekMultiModalityConfig, - vision_language_config: VisionLanguageConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, + config: DeepSeekMultiModalityConfig, + vlm_config: VisionLanguageConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): super().__init__() self.config = config vision_config = config.vision_config @@ -2284,6 +2282,7 @@ def forward( positions, kv_caches, attn_metadata, + None, inputs_embeds=inputs_embeds, ) @@ -2291,7 +2290,7 @@ def forward( def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head.weight, hidden_states, + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits From 026c92fbd2d19f332eaa4d4d825dea2bace66fa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 4 Jul 2024 15:37:16 +0800 Subject: [PATCH 20/47] Adaptation code update --- examples/deepseek_vl_example.py | 30 +++---- tests/models/test_deepseek_vl.py | 4 +- vllm/model_executor/models/deepseek_vl.py | 103 ++++++++++++++-------- 3 files changed, 80 insertions(+), 57 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index b1d18ceb2af65..84be2ebd5b397 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -1,6 +1,7 @@ import os import subprocess +import torch from PIL import Image # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. @@ -8,27 +9,19 @@ from vllm import LLM, SamplingParams sample_params = SamplingParams(temperature=0, max_tokens=1024) -model = "/pretrained_models/deepseek-vl-7b-chat" -prompt = ( - "You are a helpful language and vision assistant." - "You are able to understand the visual content that the user provides," - "and assist the user with a variety of tasks using natural language.\n" +model = "deepseek-ai/deepseek-vl-7b-chat" +model = "deepseek-ai/deepseek-vl-1.3b-chat" +prompt = "You are a helpful language and vision assistant." 
\ + "You are able to understand the visual content that the user provides," \ + "and assist the user with a variety of tasks using natural language.\n" \ "User: Describe the content of this image.\nAssistant:" -) - -prompt = prompt.replace("", "" * 576) def run_deepseek_vl(): - llm = LLM( - model=model, - image_token_id=100015, - image_input_shape="1,3,1024,1024", - image_feature_size=576, - gpu_memory_utilization=0.9, - max_model_len=3072, - enforce_eager=True, - ) + llm = LLM(model=model, + max_model_len=3072, + enforce_eager=True, + dtype=torch.bfloat16) image = Image.open("images/stop_sign.jpg") @@ -39,8 +32,7 @@ def run_deepseek_vl(): "image": image }, }, - sampling_params=sample_params, - ) + sampling_params=sample_params) for o in outputs: generated_text = o.outputs[0].text diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index f8b75d5eb5825..53ca234b9f52f 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -244,8 +244,8 @@ def run_test( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) check_outputs_equal( - hf_outputs, - [ + outputs_0_lst=hf_outputs, + outputs_1_lst=[ vllm_to_hf_output(vllm_output, vlm_config, model_id) for vllm_output in vllm_outputs ], diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index 3e10f2c0ee4db..ddf6ee22d09ff 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -44,8 +44,8 @@ from transformers.image_utils import to_numpy_array from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VisionLanguageConfig -from vllm.inputs import INPUT_REGISTRY, InputContext +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -55,6 +55,8 @@ from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig @@ -67,6 +69,8 @@ IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) LayerType = Union[str, Callable, Type[torch.nn.Module]] +IMAGE_FEATURE_SIZE = 576 +IMAGE_TOKEN_ID = 100015 # From PyTorch internals @@ -213,7 +217,7 @@ def drop_path( # From https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py # noqa class DropPath(nn.Module): """ - Drop paths (Stochastic Depth) per sample + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ @@ -390,7 +394,7 @@ def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]: return self.patch_size def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]: - """Get grid (feature) size for given image size taking account + """Get grid (feature) size for given image size taking account of dynamic padding. 
NOTE: must be torchscript compatible so using fixed tuple indexing """ @@ -754,7 +758,7 @@ def forward( Args: x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]: if it is a tuple of torch.Tensor, - then it comes from the hybrid vision encoder, + then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x); otherwise it is the feature from the single vision encoder. @@ -953,7 +957,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class VisionTransformer(nn.Module): """Vision Transformer - A PyTorch impl of : `An Image is Worth 16x16 Words: + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 """ @@ -1000,20 +1004,20 @@ def __init__( patch_size: Patch size. in_chans: Number of image input channels. num_classes: Number of classes for classification head. - global_pool: Type of global pooling for final sequence + global_pool: Type of global pooling for final sequence (default: 'token'). embed_dim: Transformer embedding dimension. depth: Depth of transformer. num_heads: Number of attention heads. mlp_ratio: Ratio of mlp hidden dim to embedding dim. qkv_bias: Enable bias for qkv projections if True. - init_values: Layer-scale init values + init_values: Layer-scale init values (layer-scale enabled if not None). class_token: Use class token. - no_embed_class: Don't include position embeddings for class + no_embed_class: Don't include position embeddings for class (or reg) tokens. reg_tokens: Number of register tokens. - fc_norm: Pre head norm after pool (instead of before), if None, + fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'. drop_rate: Head dropout rate. pos_drop_rate: Position embedding dropout rate. @@ -1429,11 +1433,11 @@ def __init__( norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. use_abs_pos (bool): If True, use absolute positional embeddings. - use_rel_pos (bool): If True, add relative positional embeddings to + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative - positional parameters. window_size (int): Window size for window - attention blocks. global_attn_indexes (list): Indexes for blocks + rel_pos_zero_init (bool): If True, zero initialize relative + positional parameters. window_size (int): Window size for window + attention blocks. global_attn_indexes (list): Indexes for blocks using global attention. downsample_channels (list): Channels for downsampling layers. """ @@ -1553,7 +1557,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Block(nn.Module): """ - Transformer blocks with support of window attention and + Transformer blocks with support of window attention and residual propagation blocks """ @@ -1575,17 +1579,17 @@ def __init__( dim (int): Number of input channels. num_heads (int): Number of attention heads in each ViT block. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool): If True, add a learnable bias to + qkv_bias (bool): If True, add a learnable bias to query, key, value. norm_layer (nn.Module): Normalization layer. act_layer (nn.Module): Activation layer. - use_rel_pos (bool): If True, add relative positional embeddings to + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. 
- rel_pos_zero_init (bool): If True, zero initialize relative + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - window_size (int): Window size for window attention blocks. If - it equals 0, then use global attention. input_size - (tuple(int, int) or None): Input resolution for calculating + window_size (int): Window size for window attention blocks. If + it equals 0, then use global attention. input_size + (tuple(int, int) or None): Input resolution for calculating the relative positional parameter size. """ @@ -1643,13 +1647,13 @@ def __init__( Args: dim (int): Number of input channels. num_heads (int): Number of attention heads. - qkv_bias (bool): If True, add a learnable bias to + qkv_bias (bool): If True, add a learnable bias to query, key, value. - rel_pos (bool): If True, add relative positional embeddings + rel_pos (bool): If True, add relative positional embeddings to the attention map. - rel_pos_zero_init (bool): If True, zero initialize relative + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. - input_size (tuple(int, int) or None): Input resolution for + input_size (tuple(int, int) or None): Input resolution for calculating the relative positional parameter size. """ @@ -1738,7 +1742,7 @@ def window_unpartition( """ Window unpartition into original sequences and removing padding. Args: - windows (tensor): input tokens with + windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. window_size (int): window size. pad_hw (Tuple): padded height and width (Hp, Wp). @@ -2160,23 +2164,48 @@ def dummy_data_for_deepseek(ctx: InputContext, seq_len: int): if not image_size: # Get image size for 7b model image_size = vision_config.params["high_res_cfg"]["image_size"] - seq_data = dummy_seq_data_for_clip(vision_config, - seq_len, - image_token_id=100015, - image_feature_size_override=576) + seq_data = dummy_seq_data_for_clip( + vision_config, + seq_len, + image_token_id=IMAGE_TOKEN_ID, + image_feature_size_override=IMAGE_FEATURE_SIZE, + ) mm_data = {"image": Image.new("RGB", (image_size, image_size), color=0)} return seq_data, mm_data +def input_processor_for_deepseek(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer) + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=IMAGE_TOKEN_ID, + repeat_count=IMAGE_FEATURE_SIZE, + ) + return LLMInputs( + prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data, + ) + + @MULTIMODAL_REGISTRY.register_image_input_mapper() @INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) +@INPUT_REGISTRY.register_input_processor(input_processor_for_deepseek) class DeepSeekMultiModalityCausalLM(nn.Module, SupportsVision): - def __init__(self, - config: DeepSeekMultiModalityConfig, - vlm_config: VisionLanguageConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: DeepSeekMultiModalityConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): super().__init__() self.config = config vision_config = config.vision_config @@ -2264,7 +2293,9 
@@ def forward( if image_features is not None and pixel_values is None: pixel_values = image_features if pixel_values is not None: - image_token_id = 100015 + target_dtype = self.lm_head.weight.dtype + pixel_values = pixel_values.to(target_dtype) + image_token_id = IMAGE_TOKEN_ID image_token_mask = input_ids == image_token_id inputs_embeds = self.prepare_inputs_embeds( input_ids, From ffa1cc7783e9a822acf94e3eebe7d2d4d069bcdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 11:58:17 +0800 Subject: [PATCH 21/47] Update Test case --- tests/models/test_deepseek_vl.py | 226 +++++++++++++++---------------- 1 file changed, 112 insertions(+), 114 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 53ca234b9f52f..3438837033e25 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -4,14 +4,17 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM -from vllm.config import VisionLanguageConfig +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.transformers_utils.config import DeepSeekMultiModalityConfig -from ..conftest import HfRunner, VllmRunner, _ImageAssets -from .utils import check_outputs_equal +from tests.conftest import HfRunner, VllmRunner, _ImageAssets +from tests.models.utils import check_logprobs_close +models = ["deepseek-ai/deepseek-vl-7b-chat"] +IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm # The image token is placed before "user" on purpose so that the test can pass @@ -85,10 +88,33 @@ def prepare_inputs_embeds( return inputs_embeds +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + tokenizer = AutoTokenizer.from_pretrained(model) + eos_token_id = tokenizer.eos_token_id + + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID + ] + + assert output_str[0] == " " + hf_output_str = output_str[1:] + if hf_output_ids[-1] == eos_token_id: + hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) + + return hf_output_ids, hf_output_str, out_logprobs + + def get_input(tokenizer, prompt, image): image_id = 100015 vl_image = VLMImageProcessor(1024) + prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) input_ids = torch.LongTensor(input_ids) image_token_mask = input_ids == image_id @@ -114,63 +140,16 @@ def get_input(tokenizer, prompt, image): return prepare -def iter_deepseek_vl_configs(model_name: str): - image_hw_to_feature_size = { - (336, 336): 576, - } - - for (h, w), f in image_hw_to_feature_size.items(): - input_shape = (1, 3, h, w) - yield (model_name, - VisionLanguageConfig(image_feature_size=f, - image_token_id=100015, - image_input_shape=input_shape)) - - -model_and_vl_config = [ - *iter_deepseek_vl_configs("deepseek-ai/deepseek-vl-7b-chat"), -] - - -def vllm_to_hf_output( - vllm_output: Tuple[List[int], str], - vlm_config: VisionLanguageConfig, - model_id: str, -): - """Sanitize vllm output to be comparable with hf output. - The function reduces `input_ids` from 1, 32000, 32000, ..., 32000, - x1, x2, x3 ... to 1, 32000, x1, x2, x3 ... 
- It also reduces `output_str` from - "bla" to "bla". - """ - input_ids, output_str = vllm_output - image_token_id = vlm_config.image_token_id - - tokenizer = AutoTokenizer.from_pretrained(model_id) - image_token_str = tokenizer.decode(image_token_id) - - hf_input_ids = [ - input_id for idx, input_id in enumerate(input_ids) - if input_id != image_token_id or input_ids[idx - 1] != image_token_id - ] - hf_output_str = output_str.replace( - image_token_str * vlm_config.image_feature_size, "") - - return hf_input_ids, hf_output_str - - -# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] -@pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], image_assets: _ImageAssets, - model_and_config: Tuple[str, VisionLanguageConfig], + model: str, *, + size_factors: List[float], dtype: str, max_tokens: int, + num_logprobs: int, tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, ) -> None: @@ -183,53 +162,57 @@ def run_test( Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. """ - model_id, vlm_config = model_and_config - hf_images = [asset.for_hf() for asset in image_assets] - - vllm_image_prompts = [ - p.replace( - "", - "" * vlm_config.image_feature_size, - ) for p in HF_IMAGE_PROMPTS - ] + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
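The NOTE comment above (run vLLM before the HuggingFace model so that CUDA is never initialized in the parent process before vLLM's fork-based multiprocessing backend starts its workers) can be illustrated with a small standalone script. This is not part of the patch; it assumes a CUDA-capable machine with PyTorch installed and only demonstrates the general fork-vs-CUDA pitfall and the usual "spawn" workaround.

import multiprocessing as mp

import torch


def _worker(_: int) -> float:
    # In a *forked* child whose parent already initialized CUDA, this call
    # typically raises "Cannot re-initialize CUDA in forked subprocess".
    return torch.zeros(1, device="cuda").item()


if __name__ == "__main__":
    torch.cuda.init()  # parent touches CUDA first; the situation the test avoids
    # A fresh interpreter per worker ("spawn") side-steps the problem, whereas
    # the default "fork" start method on Linux would hit the error above.
    with mp.get_context("spawn").Pool(1) as pool:
        print(pool.map(_worker, [0]))

This is the reason the test collects all vLLM outputs first and only afterwards loads the HuggingFace reference model onto the GPU.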
+ + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] - with vllm_runner( - model_id, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - **vlm_config.as_cli_args_dict(), - ) as vllm_model: - vllm_images = [asset.for_vllm() for asset in image_assets] - vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, - max_tokens, - images=vllm_images) AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, MultiModalityCausalLM) - tokenizer = AutoTokenizer.from_pretrained(model_id) - hf_model = AutoModelForCausalLM.from_pretrained(model_id, + tokenizer = AutoTokenizer.from_pretrained(model) + hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) hf_model = hf_model.to("cuda").eval() - prepare_input_one = get_input( - tokenizer, - vllm_image_prompts[0], - hf_images, - ) - prepare_input_two = get_input( - tokenizer, - vllm_image_prompts[1], - hf_images, - ) - prepare_input_one = hf_model.prepare_inputs_embeds(**prepare_input_one) - prepare_input_two = hf_model.prepare_inputs_embeds(**prepare_input_two) - prepare_input = torch.concat(prepare_input_one, prepare_input_two) + prepare_input_list = [] + inputs_embeds_list = [] + for prompts, images in inputs_per_image: + print(f'prompt: {prompts}') + print(f'images: {images}') + prepare_input = get_input( + tokenizer, + prompts, + images, + ) + prepare_input_list.append(prepare_input) + inputs_embeds_list.append( + hf_model.prepare_inputs_embeds(**prepare_input)) + + inputs_embeds = torch.concat(inputs_embeds_list) attention_mask = torch.concat( - prepare_input_one["attention_mask"], - prepare_input_two["attention_mask"], - ) + [x['attention_mask'] for x in prepare_input_list]) outputs = hf_model.generate( - inputs_embeds=prepare_input, + inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_tokens, pad_token_id=tokenizer.eos_token_id, @@ -239,38 +222,53 @@ def run_test( use_cache=True, ) hf_outputs: List = [] + for o in outputs: hf_outputs.append( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, vlm_config, model_id) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model_and_config", model_and_vl_config) + for hf_outputs, vllm_outputs in zip(hf_outputs, vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) + print('END---->') + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_models( - hf_runner, - vllm_runner, - image_assets, - model_and_config, 
- dtype: str, - max_tokens: int, -) -> None: +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: run_test( hf_runner, vllm_runner, image_assets, - model_and_config, + model, + size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, + num_logprobs=num_logprobs, tensor_parallel_size=1, ) From 17711727ed9fd623a75f066de20501a2c8c28123 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 13:04:31 +0800 Subject: [PATCH 22/47] fix Test case --- tests/models/test_deepseek_vl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 3438837033e25..ba671fdc70e42 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -124,18 +124,18 @@ def get_input(tokenizer, prompt, image): "sft_format": prompt, "input_ids": - input_ids.to("cuda"), + input_ids, "pixel_values": - images_outputs.pixel_values.to(torch.bfloat16).to("cuda").reshape( + images_outputs.pixel_values.to(torch.bfloat16).reshape( 1, -1, 3, 1024, 1024), "num_image_tokens": 576, "images_seq_mask": - image_token_mask.to("cuda").reshape(1, -1), + image_token_mask.reshape(1, -1), "images_emb_mask": - images_emb_mask.to("cuda"), + images_emb_mask, "attention_mask": - torch.ones(1, len(input_ids)).to("cuda"), + torch.ones(1, len(input_ids)), } return prepare @@ -193,7 +193,7 @@ def run_test( tokenizer = AutoTokenizer.from_pretrained(model) hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) - hf_model = hf_model.to("cuda").eval() + hf_model = hf_model prepare_input_list = [] inputs_embeds_list = [] for prompts, images in inputs_per_image: From 9287c7de88d9dd0e33c633cccb9096c79846451b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 13:52:30 +0800 Subject: [PATCH 23/47] Update Test case --- tests/models/test_deepseek_vl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index ba671fdc70e42..9e569d38abbba 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -113,6 +113,8 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def get_input(tokenizer, prompt, image): image_id = 100015 + prompt = prompt[0] + image = image[0] vl_image = VLMImageProcessor(1024) prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) From 5ff27a67f4983777dd1efb9545f703c2049eda46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 15:20:34 +0800 Subject: [PATCH 24/47] Add register_max_image_tokens --- vllm/model_executor/models/deepseek_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index ddf6ee22d09ff..f79d8b03c1c75 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -2195,6 +2195,7 @@ def input_processor_for_deepseek(ctx: InputContext, llm_inputs: LLMInputs): @MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(IMAGE_FEATURE_SIZE) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_deepseek) @INPUT_REGISTRY.register_input_processor(input_processor_for_deepseek) class DeepSeekMultiModalityCausalLM(nn.Module, 
SupportsVision): From 6c41130939baa2d4ec2815aa10414c71d6708e0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 16:58:33 +0800 Subject: [PATCH 25/47] fix test case --- tests/models/test_deepseek_vl.py | 35 +++++++++++++------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 9e569d38abbba..986374dd8e7c5 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -4,14 +4,14 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs +from vllm import SamplingParams from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) +from vllm.sequence import SampleLogprobs from vllm.transformers_utils.config import DeepSeekMultiModalityConfig -from tests.conftest import HfRunner, VllmRunner, _ImageAssets -from tests.models.utils import check_logprobs_close +from ..conftest import HfRunner, VllmRunner, _ImageAssets +from .utils import check_outputs_equal models = ["deepseek-ai/deepseek-vl-7b-chat"] IMAGE_TOKEN_ID = 100015 @@ -116,7 +116,7 @@ def get_input(tokenizer, prompt, image): prompt = prompt[0] image = image[0] vl_image = VLMImageProcessor(1024) - prompt.replace('', '' * 576) + prompt = prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) input_ids = torch.LongTensor(input_ids) image_token_mask = input_ids == image_id @@ -148,7 +148,6 @@ def run_test( image_assets: _ImageAssets, model: str, *, - size_factors: List[float], dtype: str, max_tokens: int, num_logprobs: int, @@ -166,10 +165,8 @@ def run_test( """ images = [asset.pil_image for asset in image_assets] - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [([prompt], [image]) + for image, prompt in zip(images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. @@ -177,6 +174,7 @@ def run_test( # will hurt multiprocessing backend with fork method (the default method). 
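This commit's switch from check_logprobs_close to an exact check means each side now contributes plain (token_ids, generated_text) pairs, one per prompt. A toy illustration with made-up ids and text; the inline check_outputs_equal is only a stand-in for the helper imported from tests/models/utils.py.

    def check_outputs_equal(*, outputs_0_lst, outputs_1_lst, name_0, name_1):
        # stand-in for tests/models/utils.check_outputs_equal: element-wise equality
        for (ids_0, text_0), (ids_1, text_1) in zip(outputs_0_lst, outputs_1_lst):
            assert ids_0 == ids_1 and text_0 == text_1, f"{name_0} != {name_1}"

    hf_output = ([100000, 4194, 291], "The image shows a stop sign.")    # made-up
    vllm_output = ([100000, 4194, 291], "The image shows a stop sign.")  # made-up
    check_outputs_equal(outputs_0_lst=[hf_output],
                        outputs_1_lst=[vllm_output],
                        name_0="hf",
                        name_1="vllm")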
# max_model_len should be greater than image_feature_size + sample_params = SamplingParams(temperature=0) with vllm_runner(model, dtype=dtype, tensor_parallel_size=tensor_parallel_size, @@ -186,7 +184,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=images) + images=images, + sampling_params=sample_params) for prompts, images in inputs_per_image ] @@ -229,15 +228,10 @@ def run_test( hf_outputs.append( (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - for hf_outputs, vllm_outputs in zip(hf_outputs, vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs_per_image): + check_outputs_equal( + outputs_0_lst=hf_output, + outputs_1_lst=vllm_output[:2], name_0="hf", name_1="vllm", ) @@ -268,7 +262,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, vllm_runner, image_assets, model, - size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, From 0e1bc5be318beab3b4d1afcd3cdaf4747549ab3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 5 Jul 2024 17:44:39 +0800 Subject: [PATCH 26/47] fix test case --- tests/models/test_deepseek_vl.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 986374dd8e7c5..841ff56a07067 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -4,7 +4,6 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM -from vllm import SamplingParams from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.sequence import SampleLogprobs @@ -174,7 +173,6 @@ def run_test( # will hurt multiprocessing backend with fork method (the default method). 
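Dropping the explicit SamplingParams(temperature=0) just below is safe because the greedy helpers on the vLLM test runner are assumed to build an equivalent greedy configuration internally; roughly the following, which is a sketch rather than the runner's internals verbatim.

    from vllm import SamplingParams

    max_tokens, num_logprobs = 128, 5  # values used by this test's parametrization
    greedy_params = SamplingParams(temperature=0.0,
                                   max_tokens=max_tokens,
                                   logprobs=num_logprobs)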
# max_model_len should be greater than image_feature_size - sample_params = SamplingParams(temperature=0) with vllm_runner(model, dtype=dtype, tensor_parallel_size=tensor_parallel_size, @@ -184,8 +182,7 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=images, - sampling_params=sample_params) + images=images) for prompts, images in inputs_per_image ] From 3b3b8eceaff72e5838b7fee5595675b9e8b7424b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 10:30:41 +0800 Subject: [PATCH 27/47] fix test dtype error --- tests/models/test_deepseek_vl.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 841ff56a07067..d4a3835473775 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -109,7 +109,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, return hf_output_ids, hf_output_str, out_logprobs -def get_input(tokenizer, prompt, image): +def get_input(tokenizer, prompt, image, dtype): image_id = 100015 prompt = prompt[0] @@ -127,8 +127,7 @@ def get_input(tokenizer, prompt, image): "input_ids": input_ids, "pixel_values": - images_outputs.pixel_values.to(torch.bfloat16).reshape( - 1, -1, 3, 1024, 1024), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 1024, 1024), "num_image_tokens": 576, "images_seq_mask": @@ -191,17 +190,21 @@ def run_test( tokenizer = AutoTokenizer.from_pretrained(model) hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) + dtype_dict = { + 'float16': torch.float16, + 'half': torch.bfloat16, + 'float32': torch.float32, + 'auto': hf_model.dtype + } + dtype = dtype_dict.get(dtype, hf_model.dtype) + hf_model = hf_model.to(dtype) hf_model = hf_model prepare_input_list = [] inputs_embeds_list = [] for prompts, images in inputs_per_image: print(f'prompt: {prompts}') print(f'images: {images}') - prepare_input = get_input( - tokenizer, - prompts, - images, - ) + prepare_input = get_input(tokenizer, prompts, images, dtype) prepare_input_list.append(prepare_input) inputs_embeds_list.append( hf_model.prepare_inputs_embeds(**prepare_input)) From f2f29d1bbd122b0077ad2345129779cacbefd848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 10:37:02 +0800 Subject: [PATCH 28/47] fix test dtype error --- tests/models/test_deepseek_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index d4a3835473775..77c7277b061fa 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -191,6 +191,7 @@ def run_test( hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) dtype_dict = { + 'bfloat16': torch.bfloat16, 'float16': torch.float16, 'half': torch.bfloat16, 'float32': torch.float32, From 03133bbc3c20030c03479ae6ddbb52969a3fac55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 15:16:11 +0800 Subject: [PATCH 29/47] update doc and fix test error --- docs/source/models/supported_models.rst | 8 ++-- tests/models/test_deepseek_vl.py | 60 ++++++++++++------------- 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index fefda5d39e7a1..33e8b261f00ed 100644 --- a/docs/source/models/supported_models.rst 
+++ b/docs/source/models/supported_models.rst @@ -161,10 +161,6 @@ Decoder-only Language Models - Xverse - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - - * - :code:`DeepSeekMultiModalityCausalLM` - - deepseek-ai - - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc. - - .. note:: Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. @@ -198,6 +194,10 @@ Vision Language Models - Phi-3-Vision - :code:`microsoft/Phi-3-vision-128k-instruct`, etc. - + * - :code:`DeepSeekMultiModalityCausalLM` + - deepseek-ai + - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 77c7277b061fa..b96561e0d0cc8 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -199,43 +199,39 @@ def run_test( } dtype = dtype_dict.get(dtype, hf_model.dtype) hf_model = hf_model.to(dtype) - hf_model = hf_model - prepare_input_list = [] - inputs_embeds_list = [] + hf_outputs: List = [] for prompts, images in inputs_per_image: print(f'prompt: {prompts}') print(f'images: {images}') prepare_input = get_input(tokenizer, prompts, images, dtype) - prepare_input_list.append(prepare_input) - inputs_embeds_list.append( - hf_model.prepare_inputs_embeds(**prepare_input)) - - inputs_embeds = torch.concat(inputs_embeds_list) - attention_mask = torch.concat( - [x['attention_mask'] for x in prepare_input_list]) - outputs = hf_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - max_new_tokens=max_tokens, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - do_sample=False, - use_cache=True, - ) - hf_outputs: List = [] - - for o in outputs: - hf_outputs.append( - (o, tokenizer.decode(o.cpu().tolist(), skip_special_tokens=True))) - - for hf_output, vllm_output in zip(hf_outputs, vllm_outputs_per_image): - check_outputs_equal( - outputs_0_lst=hf_output, - outputs_1_lst=vllm_output[:2], - name_0="hf", - name_1="vllm", + attention_mask = prepare_input['attention_mask'] + inputs_embeds = hf_model.prepare_inputs_embeds(**prepare_input) + outputs = hf_model.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + max_new_tokens=max_tokens, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + do_sample=False, + use_cache=True, ) + for o in outputs: + hf_outputs.append((o.cpu().tolist(), + tokenizer.decode(o.cpu().tolist(), + skip_special_tokens=True))) + vllm_outputs_list = [] + for vllm_outputs in vllm_outputs_per_image: + vllm_outputs_list.append([ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ][:2]) + print(f'hf_outputs --> {hf_outputs}') + print(f'vllm_outputs --> {vllm_outputs_list}') + check_outputs_equal(outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs_list, + name_0='hf', + name_1='vllm') print('END---->') From ff6c75816aa51e1494e95634f9f7e10a51a86732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 8 Jul 2024 15:25:40 +0800 Subject: [PATCH 30/47] fix mypy error --- 
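The HF reference path in the previous patch funnels everything through prepare_inputs_embeds, whose core step is a boolean-mask scatter of the vision embeddings into the token embeddings. An isolated toy demo with made-up sizes (seq_len 6, hidden 8, 4 image-token slots):

    import torch

    inputs_embeds = torch.zeros(1, 6, 8)             # [batch, seq_len, hidden]
    images_embeds = torch.ones(4, 8)                 # one row per image token
    images_seq_mask = torch.tensor([[False, True, True, True, True, False]])

    inputs_embeds[images_seq_mask] = images_embeds   # overwrite the image-token slots
    assert inputs_embeds[0, 1:5].eq(1).all()
    assert inputs_embeds[0, 0].eq(0).all()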
tests/models/test_deepseek_vl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index b96561e0d0cc8..20771f7027acf 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -222,10 +222,11 @@ def run_test( skip_special_tokens=True))) vllm_outputs_list = [] for vllm_outputs in vllm_outputs_per_image: - vllm_outputs_list.append([ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ][:2]) + vllm_outputs_list.append( + tuple([ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ][:2])) print(f'hf_outputs --> {hf_outputs}') print(f'vllm_outputs --> {vllm_outputs_list}') check_outputs_equal(outputs_0_lst=hf_outputs, From 9d1f68e48ca8e7b4808d9d225acac1637363552d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 09:24:51 +0800 Subject: [PATCH 31/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 20771f7027acf..75c1ddc8b4f91 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -12,7 +12,7 @@ from ..conftest import HfRunner, VllmRunner, _ImageAssets from .utils import check_outputs_equal -models = ["deepseek-ai/deepseek-vl-7b-chat"] +models = ["deepseek-ai/deepseek-vl-1.3b-chat"] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm From 2025c39c87894c05251639e99688a649f8593f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 10:10:51 +0800 Subject: [PATCH 32/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 75c1ddc8b4f91..2f21278763bd8 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -114,7 +114,7 @@ def get_input(tokenizer, prompt, image, dtype): image_id = 100015 prompt = prompt[0] image = image[0] - vl_image = VLMImageProcessor(1024) + vl_image = VLMImageProcessor(384) prompt = prompt.replace('', '' * 576) input_ids = tokenizer.encode(prompt) input_ids = torch.LongTensor(input_ids) @@ -184,7 +184,7 @@ def run_test( images=images) for prompts, images in inputs_per_image ] - + print(f'vllm_outputs_per_image -> {vllm_outputs_per_image}') AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, MultiModalityCausalLM) tokenizer = AutoTokenizer.from_pretrained(model) From 773aec876ddb6adc001f6846c6bfd69805bb3506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 10:56:43 +0800 Subject: [PATCH 33/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 2f21278763bd8..dfe4c0f3c5674 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -127,7 +127,7 @@ def get_input(tokenizer, prompt, image, dtype): "input_ids": input_ids, "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 1024, 1024), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384), "num_image_tokens": 576, "images_seq_mask": From ee5a3db135b41b063d607636f7bb4507eab5df77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= 
Date: Tue, 9 Jul 2024 16:09:15 +0800 Subject: [PATCH 34/47] use 1.3b model --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index dfe4c0f3c5674..fbc514a2d78e7 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -77,7 +77,7 @@ def prepare_inputs_embeds( # [b, T, D] input_ids[input_ids < 0] = 0 # ignore the image embeddings inputs_embeds = self.language_model.get_input_embeddings()( - input_ids).reshape(1, -1, 4096) + input_ids).reshape(1, -1, 2048) # replace with the image embeddings images_embeds = images_embeds.reshape( From 71ea404b602390dfc6523f2dad77429122e97c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 17:10:04 +0800 Subject: [PATCH 35/47] use gpu --- tests/models/test_deepseek_vl.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index fbc514a2d78e7..09db2367c1cf5 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -109,7 +109,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, return hf_output_ids, hf_output_str, out_logprobs -def get_input(tokenizer, prompt, image, dtype): +def get_input(tokenizer, prompt, image, dtype, device): image_id = 100015 prompt = prompt[0] @@ -125,17 +125,17 @@ def get_input(tokenizer, prompt, image, dtype): "sft_format": prompt, "input_ids": - input_ids, + input_ids.to(device), "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384).to(device), "num_image_tokens": 576, "images_seq_mask": - image_token_mask.reshape(1, -1), + image_token_mask.reshape(1, -1).to(device), "images_emb_mask": - images_emb_mask, + images_emb_mask.to(device), "attention_mask": - torch.ones(1, len(input_ids)), + torch.ones(1, len(input_ids)).to(device), } return prepare @@ -190,6 +190,7 @@ def run_test( tokenizer = AutoTokenizer.from_pretrained(model) hf_model = AutoModelForCausalLM.from_pretrained(model, trust_remote_code=True) + device = 'cuda' dtype_dict = { 'bfloat16': torch.bfloat16, 'float16': torch.float16, @@ -198,7 +199,7 @@ def run_test( 'auto': hf_model.dtype } dtype = dtype_dict.get(dtype, hf_model.dtype) - hf_model = hf_model.to(dtype) + hf_model = hf_model.to(dtype).to(device) hf_outputs: List = [] for prompts, images in inputs_per_image: print(f'prompt: {prompts}') From 48862caf429ae75d29bbd21695aed8763d9a7df7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 9 Jul 2024 17:15:36 +0800 Subject: [PATCH 36/47] use gpu --- tests/models/test_deepseek_vl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 09db2367c1cf5..fbac01f979a3b 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -127,7 +127,8 @@ def get_input(tokenizer, prompt, image, dtype, device): "input_ids": input_ids.to(device), "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, 384).to(device), + images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, + 384).to(device), "num_image_tokens": 576, "images_seq_mask": @@ -204,7 +205,7 @@ def run_test( for prompts, images in inputs_per_image: print(f'prompt: {prompts}') print(f'images: {images}') - 
prepare_input = get_input(tokenizer, prompts, images, dtype) + prepare_input = get_input(tokenizer, prompts, images, dtype, device) attention_mask = prepare_input['attention_mask'] inputs_embeds = hf_model.prepare_inputs_embeds(**prepare_input) outputs = hf_model.language_model.generate( From 348064acb180060bcf935603018939bef441fb6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 12 Jul 2024 10:47:29 +0800 Subject: [PATCH 37/47] fix Conflicting files and update test --- requirements-common.txt | 5 +- tests/models/test_deepseek_vl.py | 277 ++++++++++++++++++++---------- vllm/transformers_utils/config.py | 8 +- 3 files changed, 198 insertions(+), 92 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 5435707bb5c6b..9521b80d7efe9 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -18,6 +18,7 @@ prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.10.1 -outlines >= 0.0.43 # Requires torch >= 2.1.0 +outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 typing_extensions -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 \ No newline at end of file +filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +pyzmq \ No newline at end of file diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index fbac01f979a3b..d31aa0ad4e0ea 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,18 +1,22 @@ from typing import List, Optional, Tuple, Type +from dataclasses import dataclass import pytest import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM +from transformers import AutoTokenizer, LlamaForCausalLM, AutoModelForVision2Seq +from transformers import LlamaTokenizerFast +from transformers.processing_utils import ProcessorMixin from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) from vllm.sequence import SampleLogprobs +from vllm.multimodal.utils import rescale_image_size from vllm.transformers_utils.config import DeepSeekMultiModalityConfig from ..conftest import HfRunner, VllmRunner, _ImageAssets -from .utils import check_outputs_equal +from .utils import check_logprobs_close -models = ["deepseek-ai/deepseek-vl-1.3b-chat"] +models = ["/deepseek-ai/deepseek-vl-1.3b-chat"] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm @@ -29,6 +33,39 @@ ] +class DictOutput(object): + + def keys(self): + return self.__dict__.keys() + + def __getitem__(self, item): + return self.__dict__[item] + + def __setitem__(self, key, value): + self.__dict__[key] = value + + +@dataclass +class VLChatProcessorOutput(DictOutput): + sft_format: List[str] + input_ids: torch.Tensor + pixel_values: torch.Tensor + attention_mask: torch.Tensor + images_seq_mask: torch.BoolTensor + images_emb_mask: torch.BoolTensor + + def __len__(self): + return len(self.input_ids) + + def to(self, device): + self.input_ids = self.input_ids.to(device) + self.attention_mask = self.attention_mask.to(device) + self.images_seq_mask = self.images_seq_mask.to(device) + self.images_emb_mask = self.images_emb_mask.to(device) + self.pixel_values = self.pixel_values.to(device=device) + return self + + class MultiModalityCausalLM(MultiModalityPreTrainedModel): def __init__(self, config: DeepSeekMultiModalityConfig): @@ -67,6 +104,7 @@ def prepare_inputs_embeds( bs, n = 
pixel_values.shape[0:2] p_b, p_n, p_c, p_h, p_w = pixel_values.shape + pixel_values = pixel_values.to(self.dtype) images = pixel_values.reshape(p_b * p_n, p_c, p_h, p_w) images_embeds = self.aligner(self.vision_model(images)) @@ -86,6 +124,121 @@ def prepare_inputs_embeds( return inputs_embeds + def generate(self, *args, **kwargs): + + sft_format = kwargs.pop('sft_format') + pixel_values = kwargs.pop('pixel_values') + images_seq_mask = kwargs.pop('images_seq_mask') + images_emb_mask = kwargs.pop('images_emb_mask') + input_ids = kwargs.pop('input_ids') + inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, + images_seq_mask) + tokenizer = AutoTokenizer.from_pretrained( + "/pretrained_models/deepseek-vl-1.3b-chat") + output = self.language_model.generate( + *args, + input_ids=input_ids, + inputs_embeds=inputs_embeds, + pad_token_id=tokenizer.eos_token_id, + bos_token_id=tokenizer.bos_token_id, + eos_token_id=tokenizer.eos_token_id, + **kwargs) + # output.sequences[0] = torch.concat([input_ids[0], output.sequences[0]]) + return output + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + +class VLChatProcessor(ProcessorMixin): + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + + attributes = ["image_processor", "tokenizer"] + + system_prompt = ( + "You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.") + + def __init__( + self, + image_processor: VLMImageProcessor, + tokenizer: LlamaTokenizerFast, + image_tag: str = "", + num_image_tokens: int = 576, + add_special_token: bool = False, + sft_format: str = "deepseek", + mask_prompt: bool = True, + ignore_id: int = -100, + **kwargs, + ): + self.image_processor = image_processor + self.tokenizer = tokenizer + + image_id = self.tokenizer.vocab.get(image_tag) + if image_id is None: + special_tokens = [image_tag] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + print(f"Add image tag = {image_tag} to the tokenizer") + + self.image_tag = image_tag + self.num_image_tokens = num_image_tokens + self.add_special_token = add_special_token + self.sft_format = sft_format + self.mask_prompt = mask_prompt + self.ignore_id = ignore_id + self.image_id = image_id + + super().__init__( + image_processor, + tokenizer, + image_tag, + num_image_tokens, + add_special_token, + sft_format, + mask_prompt, + ignore_id, + **kwargs, + ) + + def __call__(self, *arg, **kwargs): + prompt = kwargs.pop('text') + image = kwargs.pop('images') + return VLChatProcessorOutput(**self.get_input(prompt, image)) + + def get_input(self, prompt, image): + prompt = prompt + image = image + prompt = prompt.replace(self.image_tag, + self.image_tag * self.num_image_tokens) + input_ids = self.tokenizer.encode(prompt) + input_ids = torch.LongTensor(input_ids) + image_token_mask = input_ids == self.image_id + images_outputs = self.image_processor(image, return_tensors="pt") + images_emb_mask = torch.ones(1, 1, self.num_image_tokens) == 1 + image_size = self.image_processor.image_size + prepare = { + "sft_format": + prompt, + "input_ids": + input_ids.reshape(1, -1), + "pixel_values": + images_outputs.pixel_values.reshape(1, -1, 3, image_size, + image_size), + "images_seq_mask": + 
image_token_mask.reshape(1, -1), + "images_emb_mask": + images_emb_mask, + "attention_mask": + torch.ones(1, len(input_ids)), + } + return prepare + def vllm_to_hf_output(vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -109,44 +262,13 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, return hf_output_ids, hf_output_str, out_logprobs -def get_input(tokenizer, prompt, image, dtype, device): - - image_id = 100015 - prompt = prompt[0] - image = image[0] - vl_image = VLMImageProcessor(384) - prompt = prompt.replace('', '' * 576) - input_ids = tokenizer.encode(prompt) - input_ids = torch.LongTensor(input_ids) - image_token_mask = input_ids == image_id - images_outputs = vl_image(image, return_tensors="pt") - images_emb_mask = torch.ones(1, 1, 576) == 1 - prepare = { - "sft_format": - prompt, - "input_ids": - input_ids.to(device), - "pixel_values": - images_outputs.pixel_values.to(dtype).reshape(1, -1, 3, 384, - 384).to(device), - "num_image_tokens": - 576, - "images_seq_mask": - image_token_mask.reshape(1, -1).to(device), - "images_emb_mask": - images_emb_mask.to(device), - "attention_mask": - torch.ones(1, len(input_ids)).to(device), - } - return prepare - - def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], image_assets: _ImageAssets, model: str, *, + size_factors: List[float], dtype: str, max_tokens: int, num_logprobs: int, @@ -164,8 +286,10 @@ def run_test( """ images = [asset.pil_image for asset in image_assets] - inputs_per_image = [([prompt], [image]) - for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. 
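The processor's get_input above relies on a simple tag-expansion trick: the single image tag in the prompt is repeated num_image_tokens times before tokenization, and the boolean mask over the resulting ids marks the slots that the vision embeddings later overwrite. A toy version with a 4-token image (the real processor uses 576 tokens; the surrounding ids are made up, while 100015 is the image-token id used throughout this series):

    import torch

    image_id, num_image_tokens = 100015, 4
    input_ids = torch.LongTensor([100000, image_id, image_id, image_id, image_id, 3920])
    images_seq_mask = input_ids == image_id
    images_emb_mask = torch.ones(1, 1, num_image_tokens) == 1

    assert images_seq_mask.sum().item() == num_image_tokens
    assert images_emb_mask.all()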
@@ -185,56 +309,34 @@ def run_test( images=images) for prompts, images in inputs_per_image ] - print(f'vllm_outputs_per_image -> {vllm_outputs_per_image}') - AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, - MultiModalityCausalLM) - tokenizer = AutoTokenizer.from_pretrained(model) - hf_model = AutoModelForCausalLM.from_pretrained(model, - trust_remote_code=True) - device = 'cuda' - dtype_dict = { - 'bfloat16': torch.bfloat16, - 'float16': torch.float16, - 'half': torch.bfloat16, - 'float32': torch.float32, - 'auto': hf_model.dtype - } - dtype = dtype_dict.get(dtype, hf_model.dtype) - hf_model = hf_model.to(dtype).to(device) - hf_outputs: List = [] - for prompts, images in inputs_per_image: - print(f'prompt: {prompts}') - print(f'images: {images}') - prepare_input = get_input(tokenizer, prompts, images, dtype, device) - attention_mask = prepare_input['attention_mask'] - inputs_embeds = hf_model.prepare_inputs_embeds(**prepare_input) - outputs = hf_model.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - max_new_tokens=max_tokens, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - do_sample=False, - use_cache=True, + # AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, + # MultiModalityCausalLM) + AutoModelForVision2Seq.register(DeepSeekMultiModalityConfig, + MultiModalityCausalLM) + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_model.processor = VLChatProcessor.from_pretrained(model) + + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + print(f'hf_outputs: {hf_outputs}') + print(f'vllm_outputs: {vllm_outputs}') + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", ) - for o in outputs: - hf_outputs.append((o.cpu().tolist(), - tokenizer.decode(o.cpu().tolist(), - skip_special_tokens=True))) - vllm_outputs_list = [] - for vllm_outputs in vllm_outputs_per_image: - vllm_outputs_list.append( - tuple([ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ][:2])) - print(f'hf_outputs --> {hf_outputs}') - print(f'vllm_outputs --> {vllm_outputs_list}') - check_outputs_equal(outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs_list, - name_0='hf', - name_1='vllm') print('END---->') @@ -248,7 +350,7 @@ def run_test( [1.0], # Single-scale, batched [1.0, 1.0, 1.0], - # Multi-scale + # # Multi-scale [0.25, 0.5, 1.0], ], ) @@ -262,6 +364,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, vllm_runner, image_assets, model, + size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 60d5a8a20a36c..41156e5a54f09 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - DeepSeekMultiModalityConfig, - JAISConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig) + JAISConfig, MedusaConfig, + MLPSpeculatorConfig, MPTConfig, + 
RWConfig, + DeepSeekMultiModalityConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -25,6 +26,7 @@ "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, + "medusa": MedusaConfig, "multi_modality": DeepSeekMultiModalityConfig, } From 0748ce41d090418d45e12873aff9b39475095b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Fri, 12 Jul 2024 12:20:10 +0800 Subject: [PATCH 38/47] update test --- tests/models/test_deepseek_vl.py | 21 +++++---------------- vllm/adapter_commons/layers.py | 2 +- vllm/prompt_adapter/layers.py | 2 +- vllm/transformers_utils/config.py | 5 ++--- 4 files changed, 9 insertions(+), 21 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index d31aa0ad4e0ea..139e267b14896 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -1,22 +1,22 @@ -from typing import List, Optional, Tuple, Type from dataclasses import dataclass +from typing import List, Optional, Tuple, Type import pytest import torch -from transformers import AutoTokenizer, LlamaForCausalLM, AutoModelForVision2Seq -from transformers import LlamaTokenizerFast +from transformers import (AutoModelForVision2Seq, AutoTokenizer, + LlamaForCausalLM, LlamaTokenizerFast) from transformers.processing_utils import ProcessorMixin from vllm.model_executor.models.deepseek_vl import ( MultiModalityPreTrainedModel, VLMImageProcessor, model_name_to_cls) -from vllm.sequence import SampleLogprobs from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs from vllm.transformers_utils.config import DeepSeekMultiModalityConfig from ..conftest import HfRunner, VllmRunner, _ImageAssets from .utils import check_logprobs_close -models = ["/deepseek-ai/deepseek-vl-1.3b-chat"] +models = ["deepseek-ai/deepseek-vl-1.3b-chat"] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm @@ -47,12 +47,10 @@ def __setitem__(self, key, value): @dataclass class VLChatProcessorOutput(DictOutput): - sft_format: List[str] input_ids: torch.Tensor pixel_values: torch.Tensor attention_mask: torch.Tensor images_seq_mask: torch.BoolTensor - images_emb_mask: torch.BoolTensor def __len__(self): return len(self.input_ids) @@ -61,7 +59,6 @@ def to(self, device): self.input_ids = self.input_ids.to(device) self.attention_mask = self.attention_mask.to(device) self.images_seq_mask = self.images_seq_mask.to(device) - self.images_emb_mask = self.images_emb_mask.to(device) self.pixel_values = self.pixel_values.to(device=device) return self @@ -126,10 +123,8 @@ def prepare_inputs_embeds( def generate(self, *args, **kwargs): - sft_format = kwargs.pop('sft_format') pixel_values = kwargs.pop('pixel_values') images_seq_mask = kwargs.pop('images_seq_mask') - images_emb_mask = kwargs.pop('images_emb_mask') input_ids = kwargs.pop('input_ids') inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, images_seq_mask) @@ -143,7 +138,6 @@ def generate(self, *args, **kwargs): bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, **kwargs) - # output.sequences[0] = torch.concat([input_ids[0], output.sequences[0]]) return output def get_output_embeddings(self): @@ -220,11 +214,8 @@ def get_input(self, prompt, image): input_ids = torch.LongTensor(input_ids) image_token_mask = input_ids == self.image_id images_outputs = self.image_processor(image, return_tensors="pt") - images_emb_mask = torch.ones(1, 1, self.num_image_tokens) == 
1 image_size = self.image_processor.image_size prepare = { - "sft_format": - prompt, "input_ids": input_ids.reshape(1, -1), "pixel_values": @@ -232,8 +223,6 @@ def get_input(self, prompt, image): image_size), "images_seq_mask": image_token_mask.reshape(1, -1), - "images_emb_mask": - images_emb_mask, "attention_mask": torch.ones(1, len(input_ids)), } diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 3ed60678b52f5..39ef9643fc3ec 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -11,4 +11,4 @@ class AdapterMapping: def __post_init__(self): self.index_mapping = tuple(self.index_mapping) - self.prompt_mapping = tuple(self.prompt_mapping) \ No newline at end of file + self.prompt_mapping = tuple(self.prompt_mapping) diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py index 27a61e692e1b7..8f5374bb6c92b 100644 --- a/vllm/prompt_adapter/layers.py +++ b/vllm/prompt_adapter/layers.py @@ -77,4 +77,4 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Update hidden states hidden_states[valid_mask] = gathered_embeddings - return hidden_states \ No newline at end of file + return hidden_states diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3bd334a14dd4a..f1f80599b2ea7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,10 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, + DeepSeekMultiModalityConfig, JAISConfig, MedusaConfig, MLPSpeculatorConfig, MPTConfig, - RWConfig, - DeepSeekMultiModalityConfig) + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -106,4 +106,3 @@ def try_get_generation_config( return GenerationConfig.from_model_config(config) except OSError: # Not found return None - From a063b71e787e4849d2d059c0699a7480b21863d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 15 Jul 2024 15:14:07 +0800 Subject: [PATCH 39/47] update test --- tests/entrypoints/openai/conftest.py | 2 +- tests/models/test_deepseek_vl.py | 29 ++++++++++------------------ 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py index 0837644f26bde..3c48b4273634d 100644 --- a/tests/entrypoints/openai/conftest.py +++ b/tests/entrypoints/openai/conftest.py @@ -66,4 +66,4 @@ def sample_sql_statements(): table: "table_1" | "table_2" condition: column "=" number number: "1" | "2" -""") \ No newline at end of file +""") diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 139e267b14896..8797769500e91 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -110,13 +110,8 @@ def prepare_inputs_embeds( images_embeds = images_embeds.reshape(bs, n * t, d) # [b, T, D] - input_ids[input_ids < 0] = 0 # ignore the image embeddings - inputs_embeds = self.language_model.get_input_embeddings()( - input_ids).reshape(1, -1, 2048) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - # replace with the image embeddings - images_embeds = images_embeds.reshape( - 1, -1, self.config.aligner_config.params["n_embed"]) inputs_embeds[images_seq_mask] = images_embeds return inputs_embeds @@ -128,15 +123,13 @@ def generate(self, *args, **kwargs): input_ids = kwargs.pop('input_ids') inputs_embeds = self.prepare_inputs_embeds(input_ids, 
pixel_values, images_seq_mask) - tokenizer = AutoTokenizer.from_pretrained( - "/pretrained_models/deepseek-vl-1.3b-chat") output = self.language_model.generate( *args, input_ids=input_ids, inputs_embeds=inputs_embeds, - pad_token_id=tokenizer.eos_token_id, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, + pad_token_id=self.tokenizer.eos_token_id, + bos_token_id=self.tokenizer.bos_token_id, + eos_token_id=self.tokenizer.eos_token_id, **kwargs) return output @@ -206,8 +199,6 @@ def __call__(self, *arg, **kwargs): return VLChatProcessorOutput(**self.get_input(prompt, image)) def get_input(self, prompt, image): - prompt = prompt - image = image prompt = prompt.replace(self.image_tag, self.image_tag * self.num_image_tokens) input_ids = self.tokenizer.encode(prompt) @@ -279,7 +270,7 @@ def run_test( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - + print(inputs_per_image) # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it @@ -298,13 +289,13 @@ def run_test( images=images) for prompts, images in inputs_per_image ] - # AutoModelForCausalLM.register(DeepSeekMultiModalityConfig, - # MultiModalityCausalLM) + AutoModelForVision2Seq.register(DeepSeekMultiModalityConfig, MultiModalityCausalLM) with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: hf_model.processor = VLChatProcessor.from_pretrained(model) + hf_model.model.tokenizer = AutoTokenizer.from_pretrained(model) hf_outputs_per_image = [ hf_model.generate_greedy_logprobs_limit(prompts, @@ -318,8 +309,8 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - print(f'hf_outputs: {hf_outputs}') - print(f'vllm_outputs: {vllm_outputs}') + # print(f'hf_outputs: {hf_outputs}') + # print(f'vllm_outputs: {vllm_outputs}') check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, @@ -339,7 +330,7 @@ def run_test( [1.0], # Single-scale, batched [1.0, 1.0, 1.0], - # # Multi-scale + # Multi-scale [0.25, 0.5, 1.0], ], ) From 8be4a362acaba8fef6aad965c66357f128a059aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 15 Jul 2024 18:03:36 +0800 Subject: [PATCH 40/47] update test --- tests/models/test_deepseek_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index 8797769500e91..c8d3ae580937b 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -134,7 +134,7 @@ def generate(self, *args, **kwargs): return output def get_output_embeddings(self): - return self.language_model.get_output_embeddings() + return None def get_input_embeddings(self): return self.language_model.get_input_embeddings() From c105475934c28bd40e84a0c1cabea182dfb11e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 16 Jul 2024 14:10:16 +0800 Subject: [PATCH 41/47] fix test failed --- tests/models/test_deepseek_vl.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index c8d3ae580937b..e85cf3b48a904 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -16,7 +16,9 @@ from ..conftest 
import HfRunner, VllmRunner, _ImageAssets from .utils import check_logprobs_close -models = ["deepseek-ai/deepseek-vl-1.3b-chat"] +models = [ + "deepseek-ai/deepseek-vl-1.3b-chat", "deepseek-ai/deepseek-vl-7b-chat" +] IMAGE_TOKEN_ID = 100015 pytestmark = pytest.mark.vlm @@ -78,6 +80,8 @@ def __init__(self, config: DeepSeekMultiModalityConfig): language_config = config.language_config self.language_model = LlamaForCausalLM(language_config) + # this model does not support tie_word_embeddings + setattr(self.config, 'tie_word_embeddings', False) def prepare_inputs_embeds( self, @@ -134,7 +138,7 @@ def generate(self, *args, **kwargs): return output def get_output_embeddings(self): - return None + return self.language_model.get_output_embeddings() def get_input_embeddings(self): return self.language_model.get_input_embeddings() @@ -270,7 +274,6 @@ def run_test( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - print(inputs_per_image) # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it @@ -309,8 +312,6 @@ def run_test( vllm_outputs_per_image): # TODO: Check whether using original CLIPVisionModel can improve # consistency against HF - # print(f'hf_outputs: {hf_outputs}') - # print(f'vllm_outputs: {vllm_outputs}') check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, From 5378c1041c159317071ed71582031dcd08238d70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Tue, 16 Jul 2024 14:43:58 +0800 Subject: [PATCH 42/47] fix test failed --- tests/models/test_deepseek_vl.py | 2 +- vllm/model_executor/models/deepseek_vl.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/test_deepseek_vl.py b/tests/models/test_deepseek_vl.py index e85cf3b48a904..275748ed34051 100644 --- a/tests/models/test_deepseek_vl.py +++ b/tests/models/test_deepseek_vl.py @@ -81,7 +81,7 @@ def __init__(self, config: DeepSeekMultiModalityConfig): language_config = config.language_config self.language_model = LlamaForCausalLM(language_config) # this model does not support tie_word_embeddings - setattr(self.config, 'tie_word_embeddings', False) + self.config.update({'tie_word_embeddings': False}) def prepare_inputs_embeds( self, diff --git a/vllm/model_executor/models/deepseek_vl.py b/vllm/model_executor/models/deepseek_vl.py index f79d8b03c1c75..6380410a4d23d 100644 --- a/vllm/model_executor/models/deepseek_vl.py +++ b/vllm/model_executor/models/deepseek_vl.py @@ -57,7 +57,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) -from vllm.sequence import SamplerOutput +from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig from .clip import dummy_seq_data_for_clip @@ -2287,6 +2287,7 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, **kwargs: object, ): pixel_values = kwargs.pop("pixel_values", None) From e6d1aeb470cbc10c056b1aeba37aa7cd832f284a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 18 Jul 2024 10:26:06 +0800 Subject: [PATCH 43/47] update example --- examples/deepseek_vl_example.py 
| 64 +++++++++++++-------------------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 84be2ebd5b397..72f485fc8ddaf 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -1,41 +1,43 @@ -import os -import subprocess - -import torch -from PIL import Image - -# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. -# You can use `.buildkite/download-images.sh` to download them +from vllm.assets.image import ImageAsset from vllm import LLM, SamplingParams sample_params = SamplingParams(temperature=0, max_tokens=1024) model = "deepseek-ai/deepseek-vl-7b-chat" model = "deepseek-ai/deepseek-vl-1.3b-chat" -prompt = "You are a helpful language and vision assistant." \ - "You are able to understand the visual content that the user provides," \ - "and assist the user with a variety of tasks using natural language.\n" \ - "User: Describe the content of this image.\nAssistant:" +prompt_one = "You are a helpful language and vision assistant." \ + "You are able to understand the visual content that the user provides," \ + "and assist the user with a variety of tasks using natural language.\n" \ + "User: Describe the content of this image.\nAssistant:"\ -def run_deepseek_vl(): - llm = LLM(model=model, - max_model_len=3072, - enforce_eager=True, - dtype=torch.bfloat16) +prompt_two = "You are a helpful language and vision assistant. You are able to " \ + "understand the visual content that the user provides, and assist the " \ + "user with a variety of tasks using natural language.\n User: " \ + "What is the season?\nAssistant:" - image = Image.open("images/stop_sign.jpg") +def run_deepseek_vl(): + llm = LLM(model=model) + stop_sign_image = ImageAsset("stop_sign").pil_image + cherry_blossom_image = ImageAsset("cherry_blossom").pil_image outputs = llm.generate( - { - "prompt": prompt, + [{ + "prompt": prompt_one, "multi_modal_data": { - "image": image + "image": stop_sign_image }, - }, - sampling_params=sample_params) + }, { + "prompt": prompt_two, + "multi_modal_data": { + "image": cherry_blossom_image + } + }], + sampling_params=sample_params, + ) for o in outputs: generated_text = o.outputs[0].text + print("------------------") print(generated_text) @@ -44,20 +46,4 @@ def main(): if __name__ == "__main__": - # Download from s3 - s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" - local_directory = "images" - - # Make sure the local directory exists or create it - os.makedirs(local_directory, exist_ok=True) - - # Use AWS CLI to sync the directory, assume anonymous access - subprocess.check_call([ - "aws", - "s3", - "sync", - s3_bucket_path, - local_directory, - "--no-sign-request", - ]) main() From bbef74815d9048a0e4e1f0b223b7c40ae672a006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 18 Jul 2024 10:32:49 +0800 Subject: [PATCH 44/47] update example --- examples/deepseek_vl_example.py | 43 ++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py index 72f485fc8ddaf..4e24539d1111d 100644 --- a/examples/deepseek_vl_example.py +++ b/examples/deepseek_vl_example.py @@ -4,16 +4,18 @@ sample_params = SamplingParams(temperature=0, max_tokens=1024) model = "deepseek-ai/deepseek-vl-7b-chat" model = "deepseek-ai/deepseek-vl-1.3b-chat" -prompt_one = "You are a helpful language and vision assistant." 
\ - "You are able to understand the visual content that the user provides," \ - "and assist the user with a variety of tasks using natural language.\n" \ - "User: Describe the content of this image.\nAssistant:"\ +prompt_one = ( + "You are a helpful language and vision assistant." + "You are able to understand the visual content that the user provides," + "and assist the user with a variety of tasks using natural language.\n" + "User: Describe the content of this image.\nAssistant:" +) - -prompt_two = "You are a helpful language and vision assistant. You are able to " \ - "understand the visual content that the user provides, and assist the " \ - "user with a variety of tasks using natural language.\n User: " \ - "What is the season?\nAssistant:" +prompt_two = ( + "You are a helpful language and vision assistant. You are able to " + "understand the visual content that the user provides, and assist the " + "user with a variety of tasks using natural language.\n User: " + "What is the season?\nAssistant:") def run_deepseek_vl(): @@ -21,17 +23,20 @@ def run_deepseek_vl(): stop_sign_image = ImageAsset("stop_sign").pil_image cherry_blossom_image = ImageAsset("cherry_blossom").pil_image outputs = llm.generate( - [{ - "prompt": prompt_one, - "multi_modal_data": { - "image": stop_sign_image + [ + { + "prompt": prompt_one, + "multi_modal_data": { + "image": stop_sign_image + }, + }, + { + "prompt": prompt_two, + "multi_modal_data": { + "image": cherry_blossom_image + }, }, - }, { - "prompt": prompt_two, - "multi_modal_data": { - "image": cherry_blossom_image - } - }], + ], sampling_params=sample_params, ) From 3552c0321597e937be20dec89603ca90996574a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Mon, 22 Jul 2024 16:27:51 +0800 Subject: [PATCH 45/47] fix ruff error --- vllm/transformers_utils/config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f1f80599b2ea7..4c56704f669ce 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,11 +5,11 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, +from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig, DeepSeekMultiModalityConfig, - JAISConfig, MedusaConfig, - MLPSpeculatorConfig, MPTConfig, - RWConfig) + DbrxConfig, JAISConfig, + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -19,6 +19,7 @@ logger = init_logger(__name__) _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + "chameleon": ChameleonConfig, "chatglm": ChatGLMConfig, "dbrx": DbrxConfig, "mpt": MPTConfig, From d615870a58deee0feb98254ce741474f78e070ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?= Date: Thu, 25 Jul 2024 14:12:45 +0800 Subject: [PATCH 46/47] fix conflict --- docs/source/models/supported_models.rst | 14 +++++++++++--- vllm/transformers_utils/config.py | 9 ++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 1bf6631b5a797..3d765231d1316 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -94,8 +94,8 @@ Decoder-only Language Models - :code:`ai21labs/Jamba-v0.1`, etc. 
     - ✅︎
   * - :code:`LlamaForCausalLM`
-    - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
-    - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
+    - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
+    - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc.
     - ✅︎
   * - :code:`MiniCPMForCausalLM`
     - MiniCPM
@@ -182,6 +182,10 @@ Vision Language Models
     - Models
     - Example HuggingFace Models
     - :ref:`LoRA `
+  * - :code:`ChameleonForConditionalGeneration`
+    - Chameleon
+    - :code:`facebook/chameleon-7b` etc.
+    -
   * - :code:`FuyuForCausalLM`
     - Fuyu
     - :code:`adept/fuyu-8b` etc.
@@ -202,6 +206,10 @@ Vision Language Models
     - Phi-3-Vision
     - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
     -
+  * - :code:`MiniCPM-V`
+    - MiniCPM-V
+    - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
+    -
   * - :code:`DeepSeekMultiModalityCausalLM`
     - deepseek-ai
     - :code:`deepseek-ai/deepseek-vl-1.3b-chat`, :code:`deepseek-ai/deepseek-vl-7b-chat`, etc.
@@ -267,4 +275,4 @@ We have the following levels of testing for models:
 1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py `_ and `test_big_models.py `_ for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
 3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests `_ and `examples `_ for the models that have passed this test.
-4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
+4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
\ No newline at end of file
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4c56704f669ce..f1f80599b2ea7 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -5,11 +5,11 @@
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
-from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig,
+from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                              DeepSeekMultiModalityConfig,
-                                             DbrxConfig, JAISConfig,
-                                             MedusaConfig, MLPSpeculatorConfig,
-                                             MPTConfig, RWConfig)
+                                             JAISConfig, MedusaConfig,
+                                             MLPSpeculatorConfig, MPTConfig,
+                                             RWConfig)

 if VLLM_USE_MODELSCOPE:
     from modelscope import AutoConfig
@@ -19,7 +19,6 @@
 logger = init_logger(__name__)

 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
-    "chameleon": ChameleonConfig,
     "chatglm": ChatGLMConfig,
     "dbrx": DbrxConfig,
     "mpt": MPTConfig,

From f48ba9b15086f3d82d371699d42482d36b019079 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E8=A1=8D=E8=81=AA=5FYancong?=
Date: Thu, 25 Jul 2024 16:42:27 +0800
Subject: [PATCH 47/47] delete useless code

---
 examples/deepseek_vl_example.py   | 2 +-
 vllm/transformers_utils/config.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/deepseek_vl_example.py b/examples/deepseek_vl_example.py
index 4e24539d1111d..38e2feb02564d 100644
--- a/examples/deepseek_vl_example.py
+++ b/examples/deepseek_vl_example.py
@@ -1,5 +1,5 @@
-from vllm.assets.image import ImageAsset
 from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset

 sample_params = SamplingParams(temperature=0, max_tokens=1024)
 model = "deepseek-ai/deepseek-vl-7b-chat"
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index f1f80599b2ea7..652505a892142 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -6,7 +6,6 @@
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
-                                             DeepSeekMultiModalityConfig,
                                              JAISConfig, MedusaConfig,
                                              MLPSpeculatorConfig, MPTConfig,
                                              RWConfig)
@@ -27,7 +26,6 @@
     "jais": JAISConfig,
     "mlp_speculator": MLPSpeculatorConfig,
     "medusa": MedusaConfig,
-    "multi_modality": DeepSeekMultiModalityConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
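
[Editor's note, not part of the patch series] The last two patches register and then unregister DeepSeekMultiModalityConfig in _CONFIG_REGISTRY. For readers unfamiliar with that table, the sketch below illustrates only the general pattern it serves: mapping a checkpoint's model_type string to a custom PretrainedConfig subclass when the stock AutoConfig result is not enough. The resolve_config helper and its exact behavior are illustrative assumptions, not vLLM's actual loader code.

    from typing import Dict, Type

    from transformers import AutoConfig, PretrainedConfig

    from vllm.transformers_utils.configs import DeepSeekMultiModalityConfig

    # Illustrative registry, shaped like _CONFIG_REGISTRY in the diffs above.
    _EXAMPLE_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
        "multi_modality": DeepSeekMultiModalityConfig,
    }

    def resolve_config(model: str) -> PretrainedConfig:
        # Load whatever config transformers can produce, then swap in the
        # registered subclass when model_type matches a registry entry.
        config = AutoConfig.from_pretrained(model, trust_remote_code=True)
        config_class = _EXAMPLE_REGISTRY.get(config.model_type)
        if config_class is not None:
            config = config_class.from_pretrained(model)
        return config

    # e.g. resolve_config("deepseek-ai/deepseek-vl-1.3b-chat")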
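[Editor's note, not part of the patch series] With the final revision of examples/deepseek_vl_example.py above, test images come from ImageAsset rather than an S3 sync, so the example can be run directly. A single-request variant is sketched below; it reuses only the names visible in the diffs (LLM, SamplingParams, ImageAsset, the deepseek-vl-1.3b-chat checkpoint) with a shortened prompt, and is a sketch rather than part of the PR.

    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    # Minimal single-image run modeled on examples/deepseek_vl_example.py.
    llm = LLM(model="deepseek-ai/deepseek-vl-1.3b-chat")
    outputs = llm.generate(
        {
            "prompt": "You are a helpful language and vision assistant.\n"
                      "User: Describe the content of this image.\nAssistant:",
            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
        },
        sampling_params=SamplingParams(temperature=0, max_tokens=1024),
    )
    print(outputs[0].outputs[0].text)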